Index: ps/trunk/source/renderer/backend/IDevice.h =================================================================== --- ps/trunk/source/renderer/backend/IDevice.h (revision 28009) +++ ps/trunk/source/renderer/backend/IDevice.h (revision 28010) @@ -1,180 +1,187 @@ -/* Copyright (C) 2023 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . */ #ifndef INCLUDED_RENDERER_BACKEND_IDEVICE #define INCLUDED_RENDERER_BACKEND_IDEVICE #include "graphics/Color.h" #include "ps/containers/Span.h" #include "renderer/backend/Backend.h" #include "renderer/backend/Format.h" #include "renderer/backend/IBuffer.h" #include "renderer/backend/IDevice.h" #include "renderer/backend/IDeviceCommandContext.h" #include "renderer/backend/IFramebuffer.h" #include "renderer/backend/IShaderProgram.h" #include "renderer/backend/ITexture.h" #include "renderer/backend/PipelineState.h" #include "scriptinterface/ScriptForward.h" #include #include #include class CShaderDefines; class CStr; namespace Renderer { namespace Backend { class IDevice { public: struct Capabilities { bool S3TC; bool ARBShaders; bool ARBShadersShadow; bool computeShaders; bool debugLabels; bool debugScopedLabels; bool multisampling; bool anisotropicFiltering; uint32_t maxSampleCount; float maxAnisotropy; uint32_t maxTextureSize; bool instancing; }; virtual ~IDevice() {} virtual Backend GetBackend() const = 0; virtual const 
std::string& GetName() const = 0; virtual const std::string& GetVersion() const = 0; virtual const std::string& GetDriverInformation() const = 0; virtual const std::vector& GetExtensions() const = 0; virtual void Report(const ScriptRequest& rq, JS::HandleValue settings) = 0; virtual std::unique_ptr CreateCommandContext() = 0; /** * Creates a graphics pipeline state. It's a caller responsibility to * guarantee a lifespan of IShaderProgram stored in the description. */ virtual std::unique_ptr CreateGraphicsPipelineState( const SGraphicsPipelineStateDesc& pipelineStateDesc) = 0; /** + * Creates a compute pipeline state. It's a caller responsibility to + * guarantee a lifespan of IShaderProgram stored in the description. + */ + virtual std::unique_ptr CreateComputePipelineState( + const SComputePipelineStateDesc& pipelineStateDesc) = 0; + + /** * Creates a vertex input layout. It's recommended to use as few different * layouts as posible. */ virtual std::unique_ptr CreateVertexInputLayout( const PS::span attributes) = 0; virtual std::unique_ptr CreateTexture( const char* name, const ITexture::Type type, const uint32_t usage, const Format format, const uint32_t width, const uint32_t height, const Sampler::Desc& defaultSamplerDesc, const uint32_t MIPLevelCount, const uint32_t sampleCount) = 0; virtual std::unique_ptr CreateTexture2D( const char* name, const uint32_t usage, const Format format, const uint32_t width, const uint32_t height, const Sampler::Desc& defaultSamplerDesc, const uint32_t MIPLevelCount = 1, const uint32_t sampleCount = 1) = 0; /** * @see IFramebuffer * * The color attachment and the depth-stencil attachment should not be * nullptr at the same time. There should not be many different clear * colors along all color attachments for all framebuffers created for * the device. * * @return A valid framebuffer if it was created successfully else nullptr. 
*/ virtual std::unique_ptr CreateFramebuffer( const char* name, SColorAttachment* colorAttachment, SDepthStencilAttachment* depthStencilAttachment) = 0; virtual std::unique_ptr CreateBuffer( const char* name, const IBuffer::Type type, const uint32_t size, const bool dynamic) = 0; virtual std::unique_ptr CreateShaderProgram( const CStr& name, const CShaderDefines& defines) = 0; /** * Acquires a backbuffer for rendering a frame. * * @return True if it was successfully acquired and we can render to it. */ virtual bool AcquireNextBackbuffer() = 0; /** * Returns a framebuffer for the current backbuffer with the required * attachment operations. It should not be called if the last * AcquireNextBackbuffer call returned false. * * It's guaranteed that for the same acquired backbuffer this function returns * a framebuffer with the same attachments and properties except load and * store operations. * * @return The last successfully acquired framebuffer that wasn't * presented. */ virtual IFramebuffer* GetCurrentBackbuffer( const AttachmentLoadOp colorAttachmentLoadOp, const AttachmentStoreOp colorAttachmentStoreOp, const AttachmentLoadOp depthStencilAttachmentLoadOp, const AttachmentStoreOp depthStencilAttachmentStoreOp) = 0; /** * Presents the backbuffer to the swapchain queue to be flipped on a * screen. Should be called only if the last AcquireNextBackbuffer call * returned true. */ virtual void Present() = 0; /** * Should be called on window surface resize. It's the device owner * responsibility to call that function. Shouldn't be called during * rendering to an acquired backbuffer. */ virtual void OnWindowResize(const uint32_t width, const uint32_t height) = 0; virtual bool IsTextureFormatSupported(const Format format) const = 0; virtual bool IsFramebufferFormatSupported(const Format format) const = 0; /** * Returns the most suitable format for the usage. Returns * Format::UNDEFINED if there is no such format. 
*/ virtual Format GetPreferredDepthStencilFormat( const uint32_t usage, const bool depth, const bool stencil) const = 0; virtual const Capabilities& GetCapabilities() const = 0; }; } // namespace Backend } // namespace Renderer #endif // INCLUDED_RENDERER_BACKEND_IDEVICE Index: ps/trunk/source/graphics/ShaderManager.cpp =================================================================== --- ps/trunk/source/graphics/ShaderManager.cpp (revision 28009) +++ ps/trunk/source/graphics/ShaderManager.cpp (revision 28010) @@ -1,499 +1,522 @@ -/* Copyright (C) 2023 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . 
*/ #include "precompiled.h" #include "ShaderManager.h" #include "graphics/PreprocessorWrapper.h" #include "graphics/ShaderTechnique.h" #include "lib/config2.h" #include "lib/hash.h" #include "lib/timer.h" #include "lib/utf8.h" #include "ps/CLogger.h" #include "ps/CStrIntern.h" #include "ps/CStrInternStatic.h" #include "ps/Filesystem.h" #include "ps/Profile.h" #include "ps/XML/Xeromyces.h" #include "renderer/backend/IDevice.h" #define USE_SHADER_XML_VALIDATION 1 #if USE_SHADER_XML_VALIDATION #include "ps/XML/RelaxNG.h" #include "ps/XML/XMLWriter.h" #endif #include #include TIMER_ADD_CLIENT(tc_ShaderValidation); CShaderManager::CShaderManager(Renderer::Backend::IDevice* device) : m_Device(device) { #if USE_SHADER_XML_VALIDATION { TIMER_ACCRUE(tc_ShaderValidation); if (!CXeromyces::AddValidator(g_VFS, "shader", "shaders/program.rng")) LOGERROR("CShaderManager: failed to load grammar shaders/program.rng"); } #endif // Allow hotloading of textures RegisterFileReloadFunc(ReloadChangedFileCB, this); } CShaderManager::~CShaderManager() { UnregisterFileReloadFunc(ReloadChangedFileCB, this); } CShaderProgramPtr CShaderManager::LoadProgram(const CStr& name, const CShaderDefines& defines) { CacheKey key = { name, defines }; std::map::iterator it = m_ProgramCache.find(key); if (it != m_ProgramCache.end()) return it->second; CShaderProgramPtr program = CShaderProgram::Create(m_Device, name, defines); if (program) { for (const VfsPath& path : program->GetFileDependencies()) AddProgramFileDependency(program, path); } else { LOGERROR("Failed to load shader '%s'", name); } m_ProgramCache[key] = program; return program; } size_t CShaderManager::EffectCacheKeyHash::operator()(const EffectCacheKey& key) const { size_t hash = 0; hash_combine(hash, key.name.GetHash()); hash_combine(hash, key.defines.GetHash()); return hash; } bool CShaderManager::EffectCacheKey::operator==(const EffectCacheKey& b) const { return name == b.name && defines == b.defines; } CShaderTechniquePtr 
CShaderManager::LoadEffect(CStrIntern name) { return LoadEffect(name, CShaderDefines()); } CShaderTechniquePtr CShaderManager::LoadEffect(CStrIntern name, const CShaderDefines& defines) { // Return the cached effect, if there is one EffectCacheKey key = { name, defines }; EffectCacheMap::iterator it = m_EffectCache.find(key); if (it != m_EffectCache.end()) return it->second; // First time we've seen this key, so construct a new effect: const VfsPath xmlFilename = L"shaders/effects/" + wstring_from_utf8(name.string()) + L".xml"; CShaderTechniquePtr tech = std::make_shared( xmlFilename, defines, PipelineStateDescCallback{}); if (!LoadTechnique(tech)) { LOGERROR("Failed to load effect '%s'", name.c_str()); tech = CShaderTechniquePtr(); } m_EffectCache[key] = tech; return tech; } CShaderTechniquePtr CShaderManager::LoadEffect( CStrIntern name, const CShaderDefines& defines, const PipelineStateDescCallback& callback) { // We don't cache techniques with callbacks. const VfsPath xmlFilename = L"shaders/effects/" + wstring_from_utf8(name.string()) + L".xml"; CShaderTechniquePtr technique = std::make_shared(xmlFilename, defines, callback); if (!LoadTechnique(technique)) { LOGERROR("Failed to load effect '%s'", name.c_str()); return {}; } return technique; } bool CShaderManager::LoadTechnique(CShaderTechniquePtr& tech) { PROFILE2("loading technique"); PROFILE2_ATTR("name: %s", tech->GetPath().string8().c_str()); AddTechniqueFileDependency(tech, tech->GetPath()); CXeromyces XeroFile; PSRETURN ret = XeroFile.Load(g_VFS, tech->GetPath()); if (ret != PSRETURN_OK) return false; // By default we assume that we have techinques for every dummy shader. 
if (m_Device->GetBackend() == Renderer::Backend::Backend::DUMMY) { CShaderProgramPtr shaderProgram = LoadProgram(str_dummy.string(), tech->GetShaderDefines()); std::vector techPasses; Renderer::Backend::SGraphicsPipelineStateDesc passPipelineStateDesc = Renderer::Backend::MakeDefaultGraphicsPipelineStateDesc(); passPipelineStateDesc.shaderProgram = shaderProgram->GetBackendShaderProgram(); techPasses.emplace_back( m_Device->CreateGraphicsPipelineState(passPipelineStateDesc), shaderProgram); tech->SetPasses(std::move(techPasses)); return true; } // Define all the elements and attributes used in the XML file #define EL(x) int el_##x = XeroFile.GetElementID(#x) #define AT(x) int at_##x = XeroFile.GetAttributeID(#x) EL(blend); EL(color); + EL(compute); EL(cull); EL(define); EL(depth); EL(pass); EL(polygon); EL(require); EL(sort_by_distance); EL(stencil); AT(compare); AT(constant); AT(context); AT(depth_fail); AT(dst); AT(fail); AT(front_face); AT(func); AT(mask); AT(mask_read); AT(mask_red); AT(mask_green); AT(mask_blue); AT(mask_alpha); AT(mode); AT(name); AT(op); AT(pass); AT(reference); AT(shader); AT(shaders); AT(src); AT(test); AT(value); #undef AT #undef EL // Prepare the preprocessor for conditional tests CPreprocessorWrapper preprocessor; preprocessor.AddDefines(tech->GetShaderDefines()); XMBElement root = XeroFile.GetRoot(); // Find all the techniques that we can use, and their preference std::optional usableTech; XERO_ITER_EL(root, technique) { bool isUsable = true; XERO_ITER_EL(technique, child) { XMBAttributeList attrs = child.GetAttributes(); // TODO: require should be an attribute of the tech and not its child. 
if (child.GetNodeName() == el_require) { if (attrs.GetNamedItem(at_shaders) == "arb") { if (m_Device->GetBackend() != Renderer::Backend::Backend::GL_ARB || !m_Device->GetCapabilities().ARBShaders) { isUsable = false; } } else if (attrs.GetNamedItem(at_shaders) == "glsl") { if (m_Device->GetBackend() != Renderer::Backend::Backend::GL) isUsable = false; } else if (attrs.GetNamedItem(at_shaders) == "spirv") { if (m_Device->GetBackend() != Renderer::Backend::Backend::VULKAN) isUsable = false; } else if (!attrs.GetNamedItem(at_context).empty()) { CStr cond = attrs.GetNamedItem(at_context); if (!preprocessor.TestConditional(cond)) isUsable = false; } } } if (isUsable) { usableTech.emplace(technique); break; } } if (!usableTech.has_value()) { debug_warn(L"Can't find a usable technique"); return false; } tech->SetSortByDistance(false); + const auto loadShaderProgramForTech = [&](const CStr& name, const CShaderDefines& defines) + { + CShaderProgramPtr shaderProgram = LoadProgram(name.c_str(), defines); + if (shaderProgram) + { + for (const VfsPath& shaderProgramPath : shaderProgram->GetFileDependencies()) + AddTechniqueFileDependency(tech, shaderProgramPath); + } + return shaderProgram; + }; + CShaderDefines techDefines = tech->GetShaderDefines(); XERO_ITER_EL((*usableTech), Child) { if (Child.GetNodeName() == el_define) { techDefines.Add(CStrIntern(Child.GetAttributes().GetNamedItem(at_name)), CStrIntern(Child.GetAttributes().GetNamedItem(at_value))); } else if (Child.GetNodeName() == el_sort_by_distance) { tech->SetSortByDistance(true); } } // We don't want to have a shader context depending on the order of define and // pass tags. // TODO: we might want to implement that in a proper way via splitting passes // and tags in different groups in XML. 
std::vector techPasses; XERO_ITER_EL((*usableTech), Child) { if (Child.GetNodeName() == el_pass) { CShaderDefines passDefines = techDefines; Renderer::Backend::SGraphicsPipelineStateDesc passPipelineStateDesc = Renderer::Backend::MakeDefaultGraphicsPipelineStateDesc(); XERO_ITER_EL(Child, Element) { if (Element.GetNodeName() == el_define) { passDefines.Add(CStrIntern(Element.GetAttributes().GetNamedItem(at_name)), CStrIntern(Element.GetAttributes().GetNamedItem(at_value))); } else if (Element.GetNodeName() == el_blend) { passPipelineStateDesc.blendState.enabled = true; passPipelineStateDesc.blendState.srcColorBlendFactor = passPipelineStateDesc.blendState.srcAlphaBlendFactor = Renderer::Backend::ParseBlendFactor(Element.GetAttributes().GetNamedItem(at_src)); passPipelineStateDesc.blendState.dstColorBlendFactor = passPipelineStateDesc.blendState.dstAlphaBlendFactor = Renderer::Backend::ParseBlendFactor(Element.GetAttributes().GetNamedItem(at_dst)); if (!Element.GetAttributes().GetNamedItem(at_op).empty()) { passPipelineStateDesc.blendState.colorBlendOp = passPipelineStateDesc.blendState.alphaBlendOp = Renderer::Backend::ParseBlendOp(Element.GetAttributes().GetNamedItem(at_op)); } if (!Element.GetAttributes().GetNamedItem(at_constant).empty()) { if (!passPipelineStateDesc.blendState.constant.ParseString( Element.GetAttributes().GetNamedItem(at_constant))) { LOGERROR("Failed to parse blend constant: %s", Element.GetAttributes().GetNamedItem(at_constant).c_str()); } } } else if (Element.GetNodeName() == el_color) { passPipelineStateDesc.blendState.colorWriteMask = 0; #define MASK_CHANNEL(ATTRIBUTE, VALUE) \ if (Element.GetAttributes().GetNamedItem(ATTRIBUTE) == "TRUE") \ passPipelineStateDesc.blendState.colorWriteMask |= Renderer::Backend::ColorWriteMask::VALUE MASK_CHANNEL(at_mask_red, RED); MASK_CHANNEL(at_mask_green, GREEN); MASK_CHANNEL(at_mask_blue, BLUE); MASK_CHANNEL(at_mask_alpha, ALPHA); #undef MASK_CHANNEL } else if (Element.GetNodeName() == el_cull) { if 
(!Element.GetAttributes().GetNamedItem(at_mode).empty()) { passPipelineStateDesc.rasterizationState.cullMode = Renderer::Backend::ParseCullMode(Element.GetAttributes().GetNamedItem(at_mode)); } if (!Element.GetAttributes().GetNamedItem(at_front_face).empty()) { passPipelineStateDesc.rasterizationState.frontFace = Renderer::Backend::ParseFrontFace(Element.GetAttributes().GetNamedItem(at_front_face)); } } else if (Element.GetNodeName() == el_depth) { if (!Element.GetAttributes().GetNamedItem(at_test).empty()) { passPipelineStateDesc.depthStencilState.depthTestEnabled = Element.GetAttributes().GetNamedItem(at_test) == "TRUE"; } if (!Element.GetAttributes().GetNamedItem(at_func).empty()) { passPipelineStateDesc.depthStencilState.depthCompareOp = Renderer::Backend::ParseCompareOp(Element.GetAttributes().GetNamedItem(at_func)); } if (!Element.GetAttributes().GetNamedItem(at_mask).empty()) { passPipelineStateDesc.depthStencilState.depthWriteEnabled = Element.GetAttributes().GetNamedItem(at_mask) == "true"; } } else if (Element.GetNodeName() == el_polygon) { if (!Element.GetAttributes().GetNamedItem(at_mode).empty()) { passPipelineStateDesc.rasterizationState.polygonMode = Renderer::Backend::ParsePolygonMode(Element.GetAttributes().GetNamedItem(at_mode)); } } else if (Element.GetNodeName() == el_stencil) { if (!Element.GetAttributes().GetNamedItem(at_test).empty()) { passPipelineStateDesc.depthStencilState.stencilTestEnabled = Element.GetAttributes().GetNamedItem(at_test) == "TRUE"; } if (!Element.GetAttributes().GetNamedItem(at_reference).empty()) { passPipelineStateDesc.depthStencilState.stencilReference = Element.GetAttributes().GetNamedItem(at_reference).ToULong(); } if (!Element.GetAttributes().GetNamedItem(at_mask_read).empty()) { passPipelineStateDesc.depthStencilState.stencilReadMask = Element.GetAttributes().GetNamedItem(at_mask_read).ToULong(); } if (!Element.GetAttributes().GetNamedItem(at_mask).empty()) { passPipelineStateDesc.depthStencilState.stencilWriteMask 
= Element.GetAttributes().GetNamedItem(at_mask).ToULong(); } if (!Element.GetAttributes().GetNamedItem(at_compare).empty()) { passPipelineStateDesc.depthStencilState.stencilFrontFace.compareOp = passPipelineStateDesc.depthStencilState.stencilBackFace.compareOp = Renderer::Backend::ParseCompareOp(Element.GetAttributes().GetNamedItem(at_compare)); } if (!Element.GetAttributes().GetNamedItem(at_fail).empty()) { passPipelineStateDesc.depthStencilState.stencilFrontFace.failOp = passPipelineStateDesc.depthStencilState.stencilBackFace.failOp = Renderer::Backend::ParseStencilOp(Element.GetAttributes().GetNamedItem(at_fail)); } if (!Element.GetAttributes().GetNamedItem(at_pass).empty()) { passPipelineStateDesc.depthStencilState.stencilFrontFace.passOp = passPipelineStateDesc.depthStencilState.stencilBackFace.passOp = Renderer::Backend::ParseStencilOp(Element.GetAttributes().GetNamedItem(at_pass)); } if (!Element.GetAttributes().GetNamedItem(at_depth_fail).empty()) { passPipelineStateDesc.depthStencilState.stencilFrontFace.depthFailOp = passPipelineStateDesc.depthStencilState.stencilBackFace.depthFailOp = Renderer::Backend::ParseStencilOp(Element.GetAttributes().GetNamedItem(at_depth_fail)); } } } // Load the shader program after we've read all the possibly-relevant s. 
CShaderProgramPtr shaderProgram = - LoadProgram(Child.GetAttributes().GetNamedItem(at_shader).c_str(), passDefines); + loadShaderProgramForTech(Child.GetAttributes().GetNamedItem(at_shader), passDefines); if (shaderProgram) { - for (const VfsPath& shaderProgramPath : shaderProgram->GetFileDependencies()) - AddTechniqueFileDependency(tech, shaderProgramPath); if (tech->GetPipelineStateDescCallback()) tech->GetPipelineStateDescCallback()(passPipelineStateDesc); passPipelineStateDesc.shaderProgram = shaderProgram->GetBackendShaderProgram(); techPasses.emplace_back( m_Device->CreateGraphicsPipelineState(passPipelineStateDesc), shaderProgram); } } + else if (Child.GetNodeName() == el_compute) + { + CShaderProgramPtr shaderProgram = + loadShaderProgramForTech(Child.GetAttributes().GetNamedItem(at_shader), techDefines); + if (shaderProgram) + { + Renderer::Backend::SComputePipelineStateDesc computePipelineStateDesc{}; + computePipelineStateDesc.shaderProgram = shaderProgram->GetBackendShaderProgram(); + tech->SetComputePipelineState( + m_Device->CreateComputePipelineState(computePipelineStateDesc), shaderProgram); + } + } } - tech->SetPasses(std::move(techPasses)); + if (!techPasses.empty()) + tech->SetPasses(std::move(techPasses)); return true; } size_t CShaderManager::GetNumEffectsLoaded() const { return m_EffectCache.size(); } /*static*/ Status CShaderManager::ReloadChangedFileCB(void* param, const VfsPath& path) { return static_cast(param)->ReloadChangedFile(path); } Status CShaderManager::ReloadChangedFile(const VfsPath& path) { // Find all shader programs using this file. const auto programs = m_HotloadPrograms.find(path); if (programs != m_HotloadPrograms.end()) { // Reload all shader programs using this file. for (const std::weak_ptr& ptr : programs->second) if (std::shared_ptr program = ptr.lock()) program->Reload(); } // Find all shader techinques using this file. We need to reload them after // shader programs. 
const auto techniques = m_HotloadTechniques.find(path); if (techniques != m_HotloadTechniques.end()) { // Reload all shader techinques using this file. for (const std::weak_ptr& ptr : techniques->second) if (std::shared_ptr technique = ptr.lock()) { if (!LoadTechnique(technique)) LOGERROR("Failed to reload technique '%s'", technique->GetPath().string8().c_str()); } } return INFO::OK; } void CShaderManager::AddTechniqueFileDependency(const CShaderTechniquePtr& technique, const VfsPath& path) { m_HotloadTechniques[path].insert(technique); } void CShaderManager::AddProgramFileDependency(const CShaderProgramPtr& program, const VfsPath& path) { m_HotloadPrograms[path].insert(program); } Index: ps/trunk/binaries/data/config/default.cfg =================================================================== --- ps/trunk/binaries/data/config/default.cfg (revision 28009) +++ ps/trunk/binaries/data/config/default.cfg (revision 28010) @@ -1,604 +1,607 @@ ; Global Configuration Settings ; ; ************************************************************** ; * DO NOT EDIT THIS FILE if you want personal customisations: * ; * create a text file called "local.cfg" instead, and copy * ; * the lines from this file that you want to change. * ; * * ; * If a setting is part of a section (for instance [hotkey]) * ; * you need to append the section name at the beginning of * ; * your custom line (for instance you need to write * ; * "hotkey.pause = Space" if you want to change the pausing * ; * hotkey to the spacebar). * ; * * ; * On Linux, create: * ; * $XDG_CONFIG_HOME/0ad/config/local.cfg * ; * (Note: $XDG_CONFIG_HOME defaults to ~/.config) * ; * * ; * On OS X, create: * ; * ~/Library/Application\ Support/0ad/config/local.cfg * ; * * ; * On Windows, create: * ; * %appdata%\0ad\config\local.cfg * ; * * ; ************************************************************** ; Enable/disable windowed mode by default. (Use Alt+Enter to toggle in the game.) 
windowed = false ; Switches between real fullscreen and borderless window on a full display size. borderless.fullscreen = true ; Hides a window border in the windowed mode. borderless.window = false ; Constrain mouse in the fullscreen mode to a window boundaries. window.mousegrabinfullscreen = true ; The same but for the window mode. window.mousegrabinwindowmode = false ; Show detailed tooltips (Unit stats) showdetailedtooltips = false ; Pause the game on window focus loss (Only applicable to single player mode) pauseonfocusloss = true ; Persist settings after leaving the game setup screen persistmatchsettings = true ; Default player name to use in multiplayer ; playername = "anonymous" ; Default server name or IP to use in multiplayer multiplayerserver = "127.0.0.1" ; Force a particular resolution. (If these are 0, the default is ; to keep the current desktop resolution in fullscreen mode or to ; use 1024x768 in windowed mode.) xres = 0 yres = 0 ; Force a non-standard bit depth (if 0 then use the current desktop bit depth) bpp = 0 ; Preferred display (for multidisplay setups, only works with SDL 2.0) display = 0 ; Enable Hi-DPI where supported, currently working only for testing. hidpi = false ; Allows to force GL version for SDL forceglversion = false forceglprofile = "compatibility" ; Possible values: compatibility, core, es forceglmajorversion = 3 forceglminorversion = 3 ; Big screenshot tiles screenshot.tiles = 8 screenshot.tilewidth = 480 screenshot.tileheight = 270 ; Emulate right-click with Ctrl+Click on Mac mice macmouse = false ; System settings: ; if false, actors won't be rendered but anything entity will be. renderactors = true watereffects=true ; When disabled, force usage of the fixed pipeline water. This is faster, but really, really ugly. waterfancyeffects = false waterrealdepth = true waterrefraction = true waterreflection = true shadows = true shadowquality = 0 ; Shadow map resolution. 
(-1 - Low, 0 - Medium, 1 - High, 2 - Very High) ; High values can crash the game when using a graphics card with low memory! shadowpcf = true ; Increases details closer to the camera but decreases performance ; especially on low hardware. shadowscascadecount = 1 shadowscascadedistanceratio = 1.7 ; Hides shadows after the distance. shadowscutoffdistance = 300.0 ; If true shadows cover the whole map instead of the camera frustum. shadowscovermap = false +renderer.scale = 1.0 +renderer.upscale.technique = "fsr" + vsync = false particles = true fog = true silhouettes = true showsky = true ; Uses a synchonized call to a GL driver to get an error state. Useful ; for a debugging of a system without GL_KHR_debug. gl.checkerrorafterswap = false ; Different ways to draw a cursor, possible values are "sdl" and "system". ; The "system" one doesn't support a visual change of the cursor. cursorbackend = "sdl" ; Backends for all graphics rendering: ; glarb - GL with legacy assembler-like shaders, might used only for buggy drivers. ; gl - GL with GLSL shaders, should be used by default. ; dummy - backend that does nothing, allows to check performance without backend drivers. ; vulkan - Vulkan with SPIR-V shaders. rendererbackend = "gl" ; Enables additional debug information in renderer backend. renderer.backend.debugcontext = false renderer.backend.debugmessages = false renderer.backend.debuglabels = false renderer.backend.debugscopedlabels = false renderer.backend.gl.enableframebufferinvalidating = false renderer.backend.vulkan.disabledescriptorindexing = false renderer.backend.vulkan.deviceindexoverride = -1 renderer.backend.vulkan.debugbarrierafterframebufferpass = false renderer.backend.vulkan.debugwaitidlebeforeacquire = false renderer.backend.vulkan.debugwaitidlebeforepresent = false renderer.backend.vulkan.debugwaitidleafterpresent = false ; Should not be edited. It's used only for preventing of running fixed pipeline. 
renderpath = default ; (0 - low, 1 - medium, 2 - high), higher quality means worse performance. textures.quality = 2 ; (1, 2, 4, 8 and 16) textures.maxanisotropy = 2 ;;;;; EXPERIMENTAL ;;;;; ; Experimental probably-non-working GPU skinning support; requires GLSL; use at own risk gpuskinning = false ; Use smooth LOS interpolation smoothlos = true ; Use screen-space postprocessing filters (HDR, bloom, DOF, etc). Incompatible with fixed renderpath. postproc = true ; Use anti-aliasing techniques. antialiasing = "disabled" ; Use sharpening techniques. sharpening = "disabled" sharpness = 0.3 ; Quality used for actors. max_actor_quality=200 ; Whether or not actor variants are selected randomly, possible values are "full", "limited", "none". variant_diversity = "full" ; Quality level of shader effects (set to 10 to display all effects) materialmgr.quality = 10.0 ;;;;;;;;;;;;;;;;;;;;;;;; [adaptivefps] session = 60 ; Throttle FPS in running games (prevents 100% CPU workload). menu = 60 ; Throttle FPS in menus only. [profiler2] server = "127.0.0.1" server.port = "8000" ; Use a free port on your machine. server.threads = "6" ; Enough for the browser's parallel connection limit [hotkey] ; Each one of the specified keys will trigger the action on the left ; for multiple-key combinations, separate keys with '+'. ; See keys.txt for the list of key names. ; > SYSTEM SETTINGS exit = "" ; 'Custom' exit to desktop, SDL handles the native command via SDL_Quit. 
cancel = Escape ; Close or cancel the current dialog box/popup confirm = Return ; Confirm the current command pause = Pause, "Shift+Space" ; Pause/unpause game screenshot = F2 ; Take PNG screenshot bigscreenshot = "Shift+F2" ; Take large BMP screenshot togglefullscreen = "Alt+Return" ; Toggle fullscreen/windowed mode screenshot.watermark = "Alt+K" ; Toggle product/company watermark for official screenshots wireframe = "Alt+Shift+W" ; Toggle wireframe mode silhouettes = "Alt+Shift+S" ; Toggle unit silhouettes ; > DIALOG HOTKEYS summary = "Ctrl+Tab" ; Toggle in-game summary lobby = "Alt+L" ; Show the multiplayer lobby in a dialog window. structree = "Alt+Shift+T" ; Show structure tree civinfo = "Alt+Shift+H" ; Show civilization info ; > CLIPBOARD CONTROLS copy = "Ctrl+C" ; Copy to clipboard paste = "Ctrl+V" ; Paste from clipboard cut = "Ctrl+X" ; Cut selected text and copy to the clipboard ; > CONSOLE SETTINGS console.toggle = BackQuote, F9 ; Open/close console ; > OVERLAY KEYS fps.toggle = "Alt+F" ; Toggle frame counter realtime.toggle = "Alt+T" ; Toggle current display of computer time timeelapsedcounter.toggle = "F12" ; Toggle time elapsed counter ceasefirecounter.toggle = "" ; Toggle ceasefire counter ; > HOTKEYS ONLY chat = Return ; Toggle chat window teamchat = "T" ; Toggle chat window in team chat mode privatechat = "L" ; Toggle chat window and select the previous private chat partner ; > QUICKSAVE quicksave = "Shift+F5" quickload = "Shift+F8" [hotkey.camera] reset = "R" ; Reset camera rotation to default. 
follow = "F" ; Follow the first unit in the selection rallypointfocus = "" ; Focus the camera on the rally point of the selected building lastattackfocus = "Space" ; Focus the camera on the last notified attack zoom.in = Plus, NumPlus ; Zoom camera in (continuous control) zoom.out = Minus, NumMinus ; Zoom camera out (continuous control) zoom.wheel.in = WheelUp ; Zoom camera in (stepped control) zoom.wheel.out = WheelDown ; Zoom camera out (stepped control) rotate.up = "Ctrl+UpArrow", "Ctrl+W" ; Rotate camera to look upwards rotate.down = "Ctrl+DownArrow", "Ctrl+S" ; Rotate camera to look downwards rotate.cw = "Ctrl+LeftArrow", "Ctrl+A", Q ; Rotate camera clockwise around terrain rotate.ccw = "Ctrl+RightArrow", "Ctrl+D", E ; Rotate camera anticlockwise around terrain rotate.wheel.cw = "Shift+WheelUp", MouseX1 ; Rotate camera clockwise around terrain (stepped control) rotate.wheel.ccw = "Shift+WheelDown", MouseX2 ; Rotate camera anticlockwise around terrain (stepped control) pan = MouseMiddle ; Enable scrolling by moving mouse left = A, LeftArrow ; Scroll or rotate left right = D, RightArrow ; Scroll or rotate right up = W, UpArrow ; Scroll or rotate up/forwards down = S, DownArrow ; Scroll or rotate down/backwards scroll.speed.increase = "Ctrl+Shift+S" ; Increase scroll speed scroll.speed.decrease = "Ctrl+Alt+S" ; Decrease scroll speed rotate.speed.increase = "Ctrl+Shift+R" ; Increase rotation speed rotate.speed.decrease = "Ctrl+Alt+R" ; Decrease rotation speed zoom.speed.increase = "Ctrl+Shift+Z" ; Increase zoom speed zoom.speed.decrease = "Ctrl+Alt+Z" ; Decrease zoom speed [hotkey.camera.jump] 1 = F5 ; Jump to position N 2 = F6 3 = F7 4 = F8 ;5 = ;6 = ;7 = ;8 = ;9 = ;10 = [hotkey.camera.jump.set] 1 = "Ctrl+F5" ; Set jump position N 2 = "Ctrl+F6" 3 = "Ctrl+F7" 4 = "Ctrl+F8" ;5 = ;6 = ;7 = ;8 = ;9 = ;10 = [hotkey.profile] toggle = "F11" ; Enable/disable real-time profiler save = "Shift+F11" ; Save current profiler data to logs/profile.txt [hotkey.profile2] toggle = 
"Ctrl+F11" ; Enable/disable HTTP/GPU modes for new profiler [hotkey.selection] cancel = Esc ; Un-select all units and cancel building placement add = Shift ; Add units to selection militaryonly = Alt ; Add only military units to the selection nonmilitaryonly = "Alt+Y" ; Add only non-military units to the selection idleonly = "I" ; Select only idle units woundedonly = "O" ; Select only wounded units remove = Ctrl ; Remove units from selection idlebuilder = Semicolon ; Select next idle builder idleworker = Period, NumDecimal ; Select next idle worker idlewarrior = Slash, NumDivide ; Select next idle warrior idleunit = BackSlash ; Select next idle unit offscreen = Alt ; Include offscreen units in selection singleselection = "" ; Modifier to select units individually, opposed to per formation. [hotkey.selection.group.add] 1 = "Shift+1", "Shift+Num1" 2 = "Shift+2", "Shift+Num2" 3 = "Shift+3", "Shift+Num3" 4 = "Shift+4", "Shift+Num4" 5 = "Shift+5", "Shift+Num5" 6 = "Shift+6", "Shift+Num6" 7 = "Shift+7", "Shift+Num7" 8 = "Shift+8", "Shift+Num8" 9 = "Shift+9", "Shift+Num9" 10 = "Shift+0", "Shift+Num0" [hotkey.selection.group.save] 1 = "Ctrl+1", "Ctrl+Num1" 2 = "Ctrl+2", "Ctrl+Num2" 3 = "Ctrl+3", "Ctrl+Num3" 4 = "Ctrl+4", "Ctrl+Num4" 5 = "Ctrl+5", "Ctrl+Num5" 6 = "Ctrl+6", "Ctrl+Num6" 7 = "Ctrl+7", "Ctrl+Num7" 8 = "Ctrl+8", "Ctrl+Num8" 9 = "Ctrl+9", "Ctrl+Num9" 10 = "Ctrl+0", "Ctrl+Num0" [hotkey.selection.group.select] 1 = 1, Num1 2 = 2, Num2 3 = 3, Num3 4 = 4, Num4 5 = 5, Num5 6 = 6, Num6 7 = 7, Num7 8 = 8, Num8 9 = 9, Num9 10 = 0, Num0 [hotkey.gamesetup] mapbrowser.open = "M" [hotkey.session] kill = Delete, Backspace ; Destroy selected units stop = "H" ; Stop the current action backtowork = "Y" ; The unit will go back to work unload = "U" ; Unload garrisoned units when a building/mechanical unit is selected unloadturrets = "U" ; Unload turreted units. leaveturret = "U" ; Leave turret point. move = "" ; Modifier to move to a point instead of another action (e.g. 
gather) capture = "C" ; Modifier to capture instead of another action (e.g. attack) attack = "" ; Modifier to attack instead of another action (e.g. capture) attackmove = Ctrl ; Modifier to attackmove when clicking on a point attackmoveUnit = "Ctrl+Q" ; Modifier to attackmove targeting only units when clicking on a point garrison = Ctrl ; Modifier to garrison when clicking on building occupyturret = Ctrl ; Modifier to occupy a turret when clicking on a turret holder. autorallypoint = Ctrl ; Modifier to set the rally point on the building itself guard = "G" ; Modifier to escort/guard when clicking on unit/building patrol = "P" ; Modifier to patrol a unit repair = "J" ; Modifier to repair when clicking on building/mechanical unit queue = Shift ; Modifier to queue unit orders instead of replacing pushorderfront = "" ; Modifier to push unit orders to the front instead of replacing. orderone = Alt ; Modifier to order only one entity in selection. batchtrain = Shift ; Modifier to train units in batches massbarter = Shift ; Modifier to barter bunch of resources masstribute = Shift ; Modifier to tribute bunch of resources noconfirmation = Shift ; Do not ask confirmation when deleting a building/unit fulltradeswap = Shift ; Modifier to put the desired trade resource to 100% unloadtype = Shift ; Modifier to unload all units of type deselectgroup = Ctrl ; Modifier to deselect units when clicking group icon, instead of selecting rotate.cw = RightBracket ; Rotate building placement preview clockwise rotate.ccw = LeftBracket ; Rotate building placement preview anticlockwise snaptoedges = Ctrl ; Modifier to align new structures with nearby existing structure toggledefaultformation = "" ; Switch between null default formation and the last default formation used (defaults to "box") flare = K ; Modifier to send a flare to your allies flareactivate = "" ; Modifier to activate the mode to send a flare to your allies calltoarms = "" ; Modifier to call the selected units to the arms. 
; Overlays showstatusbars = Tab ; Toggle display of status bars devcommands.toggle = "Alt+D" ; Toggle developer commands panel highlightguarding = PageDown ; Toggle highlight of guarding units highlightguarded = PageUp ; Toggle highlight of guarded units diplomacycolors = "Alt+X" ; Toggle diplomacy colors toggleattackrange = "Alt+C" ; Toggle display of attack range overlays of selected defensive structures toggleaurasrange = "Alt+V" ; Toggle display of aura range overlays of selected units and structures togglehealrange = "Alt+B" ; Toggle display of heal range overlays of selected units [hotkey.session.gui] toggle = "Alt+G" ; Toggle visibility of session GUI menu.toggle = "F10" ; Toggle in-game menu diplomacy.toggle = "Ctrl+H" ; Toggle in-game diplomacy page barter.toggle = "Ctrl+B" ; Toggle in-game barter/trade page objectives.toggle = "Ctrl+O" ; Toggle in-game objectives page tutorial.toggle = "Ctrl+P" ; Toggle in-game tutorial panel [hotkey.session.savedgames] delete = Delete, Backspace ; Delete the selected saved game asking confirmation noconfirmation = Shift ; Do not ask confirmation when deleting a game [hotkey.session.queueunit] ; > UNIT TRAINING 1 = "Z" ; add first unit type to queue 2 = "X" ; add second unit type to queue 3 = "C" ; add third unit type to queue 4 = "V" ; add fourth unit type to queue 5 = "B" ; add fivth unit type to queue 6 = "N" ; add sixth unit type to queue 7 = "M" ; add seventh unit type to queue 8 = Comma ; add eighth unit type to queue [hotkey.session.timewarp] fastforward = "Ctrl+Space" ; If timewarp mode enabled, speed up the game rewind = "Shift+Backspace" ; If timewarp mode enabled, go back to earlier point in the game [hotkey.tab] next = "Tab", "Alt+S" ; Show the next tab prev = "Shift+Tab", "Alt+W" ; Show the previous tab [hotkey.text] ; > GUI TEXTBOX HOTKEYS delete.left = "Ctrl+Backspace" ; Delete word to the left of cursor delete.right = "Ctrl+Del" ; Delete word to the right of cursor move.left = "Ctrl+LeftArrow" ; Move 
cursor to start of word to the left of cursor move.right = "Ctrl+RightArrow" ; Move cursor to start of word to the right of cursor [gui] cursorblinkrate = 0.5 ; Cursor blink rate in seconds (0.0 to disable blinking) scale = 1.0 ; GUI scaling factor, for improved compatibility with 4K displays [gui.gamesetup] enabletips = true ; Enable/Disable tips during gamesetup (for newcomers) assignplayers = everyone ; Whether to assign joining clients to free playerslots. Possible values: everyone, buddies, disabled. aidifficulty = 3 ; Difficulty level, from 0 (easiest) to 5 (hardest) aibehavior = "random" ; Default behavior of the AI (random, balanced, aggressive or defensive) settingsslide = true ; Enable/Disable settings panel slide [gui.loadingscreen] progressdescription = false ; Whether to display the progress percent or a textual description [gui.session] dragdelta = 4 ; Number of pixels the mouse can move before the action is considered a drag camerajump.threshold = 40 ; How close do we have to be to the actual location in order to jump back to the previous one? 
timeelapsedcounter = false ; Show the game duration in the top right corner ceasefirecounter = false ; Show the remaining ceasefire time in the top right corner batchtrainingsize = 5 ; Number of units to be trained per batch by default (when pressing the hotkey) scrollbatchratio = 1 ; Number of times you have to scroll to increase/decrease the batchsize by 1 flarelifetime = 6 ; How long the flare markers on the minimap are displayed in seconds woundedunithotkeythreshold = 33 ; The wounded unit hotkey considers the selected units as wounded if their health percentage falls below this number attackrange = true ; Display attack range overlays of selected defensive structures aurasrange = true ; Display aura range overlays of selected units and structures healrange = true ; Display heal range overlays of selected units rankabovestatusbar = true ; Show rank icons above status bars experiencestatusbar = true ; Show an experience status bar above each selected unit respoptooltipsort = 0 ; Sorting players in the resources and population tooltip by value (0 - no sort, -1 - ascending, 1 - descending) snaptoedges = "disabled" ; Possible values: disabled, enabled. snaptoedgesdistancethreshold = 15 ; On which distance we don't snap to edges disjointcontrolgroups = "true" ; Whether control groups are disjoint sets or entities can be in multiple control groups at the same time. defaultformation = "special/formations/box" ; For walking orders, automatically put units into this formation if they don't have one already. formationwalkonly = "true" ; Formations are disabled when giving gather/attack/... orders. howtoshownames = 0 ; Whether the specific names are show as default, as opposed to the generic names. And whether the secondary names are shown. (0 - show both; specific names primary, 1 - show both; generic names primary, 2 - show only specific names, 3 - show only generic names) selectformationasone = "true" ; Whether to select formations as a whole by default. 
[gui.session.minimap] ; Icons that are displayed for some entities on a minimap. icons.enabled = "true" icons.opacity = 1.0 icons.sizescale = 1.0 blinkduration = 1.7 ; The blink duration while pinging pingduration = 50.0 ; The duration for which an entity will be pinged after an attack notification [gui.session.notifications] attack = true ; Show a chat notification if you are attacked by another player tribute = true ; Show a chat notification if an ally tributes resources to another team member if teams are locked, and all tributes in observer mode barter = true ; Show a chat notification to observers when a player bartered resources phase = completed ; Show a chat notification if you or an ally have started, aborted or completed a new phase, and phases of all players in observer mode. Possible values: none, completed, all. [gui.splashscreen] enable = true ; Enable/disable the splashscreen version = 0 ; Splashscreen version (date of last modification). By default, 0 to force splashscreen to appear at first launch [gui.session.diplomacycolors] self = "21 55 149" ; Color of your units when diplomacy colors are enabled ally = "86 180 31" ; Color of allies when diplomacy colors are enabled neutral = "231 200 5" ; Color of neutral players when diplomacy colors are enabled enemy = "150 20 20" ; Color of enemies when diplomacy colors are enabled [joystick] ; EXPERIMENTAL: joystick/gamepad settings enable = false deadzone = 8192 [chat] timestamp = true ; Show at which time chat messages have been sent [chat.session] extended = true ; Whether to display the chat history [lobby] history = 0 ; Number of past messages to display on join room = "arena27" ; Default MUC room to join server = "lobby.wildfiregames.com" ; Address of lobby server tls = true ; Whether to use TLS encryption when connecting to the server. 
verify_certificate = false ; Whether to reject connecting to the lobby if the TLS certificate is invalid (TODO: wait for Gloox GnuTLS trust implementation to be fixed) terms_url = "https://trac.wildfiregames.com/browser/ps/trunk/binaries/data/mods/public/gui/prelobby/common/terms/"; Allows the user to save the text and print the terms terms_of_service = "0" ; Version (hash) of the Terms of Service that the user has accepted terms_of_use = "0" ; Version (hash) of the Terms of Use that the user has accepted privacy_policy = "0" ; Version (hash) of the Privacy Policy that the user has accepted xpartamupp = "wfgbot27" ; Name of the server-side XMPP-account that manage games echelon = "echelon27" ; Name of the server-side XMPP-account that manages ratings buddies = "," ; Comma separated list of playernames that the current user has marked as buddies rememberpassword = true ; Whether to store the encrypted password in the user config [lobby.columns] gamerating = false ; Show the average rating of the participating players in a column of the gamelist [lobby.stun] enabled = true ; The STUN protocol allows hosting games without configuring the firewall and router. ; If STUN is disabled, the game relies on direct connection, UPnP and port forwarding. server = "lobby.wildfiregames.com" ; Address of the STUN server. port = 3478 ; Port of the STUN server. delay = 200 ; Duration in milliseconds that is waited between STUN messages. ; Smaller numbers speed up joins but also become less stable. 
[mod] enabledmods = "mod public" [modio] public_key = "RWQv2alKl8D0zMDJR766jpYvPy4u3y77HL/iKb/lsT1Fnf6ezoMb2x8+" ; Public key corresponding to the private key valid mods are signed with disclaimer = "0" ; Version (hash) of the Disclaimer that the user has accepted [modio.v1] baseurl = "https://api.mod.io/v1" api_key = "23df258a71711ea6e4b50893acc1ba55" name_id = "0ad" [network] duplicateplayernames = false ; Rename joining player to "User (2)" if "User" is already connected, otherwise prohibit join. lateobservers = everyone ; Allow observers to join the game after it started. Possible values: everyone, buddies, disabled. observerlimit = 8 ; Prevent further observer joins in running games if this limit is reached observermaxlag = -1 ; Make clients wait for observers if they lag more than X turns behind. -1 means "never wait for observers". autocatchup = true ; Auto-accelerate the sim rate if lagging behind (as an observer). enetmtu = 1372 ; Lower ENet protocol MTU in case packets get further fragmented on the UDP layer which may cause drops. [overlay] fps = "false" ; Show frames per second in top right corner realtime = "false" ; Show current system time in top right corner netwarnings = "true" ; Show warnings if the network connection is bad [profiler2] autoenable = false ; Enable HTTP server output at startup (default off for security/performance) gpu.arb.enable = true ; Allow GL_ARB_timer_query timing mode when available. [rlinterface] address = "127.0.0.1:6000" [sound] mastergain = 0.9 musicgain = 0.2 ambientgain = 0.6 actiongain = 0.7 uigain = 0.7 mindistance = 1 maxdistance = 350 maxstereoangle = 0.62 ; About PI/5 radians [sound.notify] nick = true ; Play a sound when someone mentions your name in the lobby or game gamesetup.join = false ; Play a sound when a new client joins the game setup [tinygettext] debug = false ; Print error messages each time a translation for an English string is not found. 
[userreport] ; Opt-in online user reporting system url_upload = "https://feedback.wildfiregames.com/report/upload/v1/" ; URL where UserReports are uploaded to url_publication = "https://feedback.wildfiregames.com/" ; URL where UserReports were analyzed and published url_terms = "https://trac.wildfiregames.com/browser/ps/trunk/binaries/data/mods/public/gui/userreport/Terms_and_Conditions.txt"; Allows the user to save the text and print the terms terms = "0" ; Version (hash) of the UserReporter Terms that the user has accepted [view] ; Camera control settings scroll.speed = 120.0 scroll.speed.modifier = 1.05 ; Multiplier for changing scroll speed scroll.mouse.detectdistance = 3 rotate.x.speed = 1.2 rotate.x.min = 28.0 rotate.x.max = 60.0 rotate.x.default = 35.0 rotate.y.speed = 2.0 rotate.y.speed.wheel = 0.45 rotate.y.default = 0.0 rotate.speed.modifier = 1.05 ; Multiplier for changing rotation speed drag.speed = 0.5 zoom.speed = 256.0 zoom.speed.wheel = 32.0 zoom.min = 50.0 zoom.max = 200.0 zoom.default = 120.0 zoom.speed.modifier = 1.05 ; Multiplier for changing zoom speed pos.smoothness = 0.1 zoom.smoothness = 0.4 rotate.x.smoothness = 0.5 rotate.y.smoothness = 0.3 near = 2.0 ; Near plane distance far = 4096.0 ; Far plane distance fov = 45.0 ; Field of view (degrees), lower is narrow, higher is wide height.smoothness = 0.5 height.min = 16 Index: ps/trunk/binaries/data/mods/mod/shaders/effects/compute_downscale.xml =================================================================== --- ps/trunk/binaries/data/mods/mod/shaders/effects/compute_downscale.xml (nonexistent) +++ ps/trunk/binaries/data/mods/mod/shaders/effects/compute_downscale.xml (revision 28010) @@ -0,0 +1,11 @@ + + + + + + + + + + + Property changes on: ps/trunk/binaries/data/mods/mod/shaders/effects/compute_downscale.xml ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: 
ps/trunk/binaries/data/mods/mod/shaders/effects/compute_rcas.xml =================================================================== --- ps/trunk/binaries/data/mods/mod/shaders/effects/compute_rcas.xml (nonexistent) +++ ps/trunk/binaries/data/mods/mod/shaders/effects/compute_rcas.xml (revision 28010) @@ -0,0 +1,11 @@ + + + + + + + + + + + Property changes on: ps/trunk/binaries/data/mods/mod/shaders/effects/compute_rcas.xml ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: ps/trunk/binaries/data/mods/mod/shaders/effects/compute_upscale_fsr.xml =================================================================== --- ps/trunk/binaries/data/mods/mod/shaders/effects/compute_upscale_fsr.xml (nonexistent) +++ ps/trunk/binaries/data/mods/mod/shaders/effects/compute_upscale_fsr.xml (revision 28010) @@ -0,0 +1,11 @@ + + + + + + + + + + + Property changes on: ps/trunk/binaries/data/mods/mod/shaders/effects/compute_upscale_fsr.xml ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: ps/trunk/binaries/data/mods/mod/shaders/effects/upscale_bilinear.xml =================================================================== --- ps/trunk/binaries/data/mods/mod/shaders/effects/upscale_bilinear.xml (nonexistent) +++ ps/trunk/binaries/data/mods/mod/shaders/effects/upscale_bilinear.xml (revision 28010) @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + Property changes on: ps/trunk/binaries/data/mods/mod/shaders/effects/upscale_bilinear.xml ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: ps/trunk/binaries/data/mods/mod/shaders/effects/upscale_nearest.xml =================================================================== --- ps/trunk/binaries/data/mods/mod/shaders/effects/upscale_nearest.xml 
(nonexistent) +++ ps/trunk/binaries/data/mods/mod/shaders/effects/upscale_nearest.xml (revision 28010) @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + Property changes on: ps/trunk/binaries/data/mods/mod/shaders/effects/upscale_nearest.xml ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: ps/trunk/binaries/data/mods/mod/shaders/glsl/common/compute.h =================================================================== --- ps/trunk/binaries/data/mods/mod/shaders/glsl/common/compute.h (nonexistent) +++ ps/trunk/binaries/data/mods/mod/shaders/glsl/common/compute.h (revision 28010) @@ -0,0 +1,22 @@ +#ifndef INCLUDED_COMMON_COMPUTE +#define INCLUDED_COMMON_COMPUTE + +#include "common/descriptor_indexing.h" +#include "common/texture.h" +#include "common/uniform.h" + +#if STAGE_COMPUTE + +#if USE_SPIRV +#define STORAGE_2D(LOCATION, FORMAT, NAME) \ + layout(set = 2, binding = LOCATION, FORMAT) uniform image2D NAME +#else +// We use offset to the binding slot for OpenGL to avoid overlapping with other +// textures as OpenGL doesn't have sets. 
+#define STORAGE_2D(LOCATION, FORMAT, NAME) \ + layout(binding = LOCATION, FORMAT) uniform image2D NAME +#endif + +#endif // STAGE_COMPUTE + +#endif // INCLUDED_COMMON_COMPUTE Property changes on: ps/trunk/binaries/data/mods/mod/shaders/glsl/common/compute.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: ps/trunk/binaries/data/mods/mod/shaders/glsl/common/descriptor_indexing.h =================================================================== --- ps/trunk/binaries/data/mods/mod/shaders/glsl/common/descriptor_indexing.h (nonexistent) +++ ps/trunk/binaries/data/mods/mod/shaders/glsl/common/descriptor_indexing.h (revision 28010) @@ -0,0 +1,12 @@ +#ifndef INCLUDED_COMMON_DESCRIPTOR_INDEXING +#define INCLUDED_COMMON_DESCRIPTOR_INDEXING + +#if USE_SPIRV && USE_DESCRIPTOR_INDEXING +#extension GL_EXT_nonuniform_qualifier : enable +const int DESCRIPTOR_INDEXING_SET_SIZE = 16384; +layout (set = 0, binding = 0) uniform sampler2D textures2D[DESCRIPTOR_INDEXING_SET_SIZE]; +layout (set = 0, binding = 1) uniform samplerCube texturesCube[DESCRIPTOR_INDEXING_SET_SIZE]; +layout (set = 0, binding = 2) uniform sampler2DShadow texturesShadow[DESCRIPTOR_INDEXING_SET_SIZE]; +#endif // USE_SPIRV && USE_DESCRIPTOR_INDEXING + +#endif // INCLUDED_COMMON_DESCRIPTOR_INDEXING Property changes on: ps/trunk/binaries/data/mods/mod/shaders/glsl/common/descriptor_indexing.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: ps/trunk/binaries/data/mods/mod/shaders/glsl/common/fragment.h =================================================================== --- ps/trunk/binaries/data/mods/mod/shaders/glsl/common/fragment.h (revision 28009) +++ ps/trunk/binaries/data/mods/mod/shaders/glsl/common/fragment.h (revision 28010) @@ -1,35 +1,28 @@ #ifndef INCLUDED_COMMON_FRAGMENT #define INCLUDED_COMMON_FRAGMENT 
+#include "common/descriptor_indexing.h" #include "common/texture.h" #include "common/uniform.h" #if USE_SPIRV -#if USE_DESCRIPTOR_INDEXING -#extension GL_EXT_nonuniform_qualifier : enable -const int DESCRIPTOR_INDEXING_SET_SIZE = 16384; -layout (set = 0, binding = 0) uniform sampler2D textures2D[DESCRIPTOR_INDEXING_SET_SIZE]; -layout (set = 0, binding = 1) uniform samplerCube texturesCube[DESCRIPTOR_INDEXING_SET_SIZE]; -layout (set = 0, binding = 2) uniform sampler2DShadow texturesShadow[DESCRIPTOR_INDEXING_SET_SIZE]; -#endif // USE_DESCRIPTOR_INDEXING - layout (location = 0) out vec4 fragmentColor; #define OUTPUT_FRAGMENT_SINGLE_COLOR(COLOR) \ fragmentColor = COLOR #define OUTPUT_FRAGMENT_COLOR(LOCATION, COLOR) \ gl_FragData[LOCATION] = COLOR #else // USE_SPIRV #define OUTPUT_FRAGMENT_SINGLE_COLOR(COLOR) \ gl_FragColor = COLOR #define OUTPUT_FRAGMENT_COLOR(LOCATION, COLOR) \ gl_FragData[LOCATION] = COLOR #endif // USE_SPIRV #endif // INCLUDED_COMMON_FRAGMENT Index: ps/trunk/binaries/data/mods/mod/shaders/glsl/common/uniform.h =================================================================== --- ps/trunk/binaries/data/mods/mod/shaders/glsl/common/uniform.h (revision 28009) +++ ps/trunk/binaries/data/mods/mod/shaders/glsl/common/uniform.h (revision 28010) @@ -1,93 +1,93 @@ #ifndef INCLUDED_COMMON_UNIFORM #define INCLUDED_COMMON_UNIFORM #if USE_SPIRV #if USE_DESCRIPTOR_INDEXING #define BEGIN_DRAW_TEXTURES struct DrawTextures { #define END_DRAW_TEXTURES }; #define NO_DRAW_TEXTURES uint padding; // We can't have empty struct in GLSL. 
#define TEXTURE_2D(LOCATION, NAME) uint NAME; #define TEXTURE_2D_SHADOW(LOCATION, NAME) uint NAME; #define TEXTURE_CUBE(LOCATION, NAME) uint NAME; #define GET_DRAW_TEXTURE_2D(NAME) \ textures2D[drawTextures.NAME] #define GET_DRAW_TEXTURE_2D_SHADOW(NAME) \ texturesShadow[drawTextures.NAME] #define GET_DRAW_TEXTURE_CUBE(NAME) \ texturesCube[drawTextures.NAME] #else // USE_DESCRIPTOR_INDEXING #define BEGIN_DRAW_TEXTURES #define END_DRAW_TEXTURES #define NO_DRAW_TEXTURES -#if STAGE_FRAGMENT +#if STAGE_FRAGMENT || STAGE_COMPUTE #define TEXTURE_2D(LOCATION, NAME) \ layout (set = 1, binding = LOCATION) uniform sampler2D NAME; #define TEXTURE_2D_SHADOW(LOCATION, NAME) \ layout (set = 1, binding = LOCATION) uniform sampler2DShadow NAME; #define TEXTURE_CUBE(LOCATION, NAME) \ layout (set = 1, binding = LOCATION) uniform samplerCube NAME; #else #define TEXTURE_2D(LOCATION, NAME) #define TEXTURE_2D_SHADOW(LOCATION, NAME) #define TEXTURE_CUBE(LOCATION, NAME) #endif #define GET_DRAW_TEXTURE_2D(NAME) NAME #define GET_DRAW_TEXTURE_2D_SHADOW(NAME) NAME #define GET_DRAW_TEXTURE_CUBE(NAME) NAME #endif // USE_DESCRIPTOR_INDEXING #if USE_DESCRIPTOR_INDEXING #define BEGIN_DRAW_UNIFORMS layout (push_constant) uniform DrawUniforms { #define END_DRAW_UNIFORMS DrawTextures drawTextures; }; #define BEGIN_MATERIAL_UNIFORMS layout (std140, set = 1, binding = 0) uniform MaterialUniforms { #define END_MATERIAL_UNIFORMS }; #else #define BEGIN_DRAW_UNIFORMS layout (push_constant) uniform DrawUniforms { #define END_DRAW_UNIFORMS }; #define BEGIN_MATERIAL_UNIFORMS layout (std140, set = 0, binding = 0) uniform MaterialUniforms { #define END_MATERIAL_UNIFORMS }; #endif #define UNIFORM(TYPE, NAME) \ TYPE NAME; #else // USE_SPIRV #define BEGIN_DRAW_TEXTURES #define END_DRAW_TEXTURES #define NO_DRAW_TEXTURES -#if STAGE_FRAGMENT +#if STAGE_FRAGMENT || STAGE_COMPUTE #define TEXTURE_2D(LOCATION, NAME) \ uniform sampler2D NAME; #define TEXTURE_2D_SHADOW(LOCATION, NAME) \ uniform sampler2DShadow NAME; #define 
TEXTURE_CUBE(LOCATION, NAME) \ uniform samplerCube NAME; #else #define TEXTURE_2D(LOCATION, NAME) #define TEXTURE_2D_SHADOW(LOCATION, NAME) #define TEXTURE_CUBE(LOCATION, NAME) #endif #define GET_DRAW_TEXTURE_2D(NAME) \ NAME #define GET_DRAW_TEXTURE_2D_SHADOW(NAME) \ NAME #define GET_DRAW_TEXTURE_CUBE(NAME) \ NAME #define BEGIN_DRAW_UNIFORMS #define END_DRAW_UNIFORMS #define BEGIN_MATERIAL_UNIFORMS #define END_MATERIAL_UNIFORMS #define UNIFORM(TYPE, NAME) \ uniform TYPE NAME; #endif // USE_SPIRV #endif // INCLUDED_COMMON_UNIFORM Index: ps/trunk/binaries/data/mods/mod/shaders/glsl/compute_downscale.cs =================================================================== --- ps/trunk/binaries/data/mods/mod/shaders/glsl/compute_downscale.cs (nonexistent) +++ ps/trunk/binaries/data/mods/mod/shaders/glsl/compute_downscale.cs (revision 28010) @@ -0,0 +1,23 @@ +#version 430 + +#include "common/compute.h" + +BEGIN_DRAW_TEXTURES + TEXTURE_2D(0, inTex) +END_DRAW_TEXTURES + +BEGIN_DRAW_UNIFORMS + UNIFORM(vec4, screenSize) +END_DRAW_UNIFORMS + +STORAGE_2D(0, rgba8, outTex); + +layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in; +void main() +{ + ivec2 position = ivec2(gl_GlobalInvocationID.xy); + if (any(greaterThanEqual(position, ivec2(screenSize.zw)))) + return; + vec2 uv = (vec2(position) + vec2(0.5, 0.5)) / screenSize.zw; + imageStore(outTex, position, texture(GET_DRAW_TEXTURE_2D(inTex), uv)); +} Property changes on: ps/trunk/binaries/data/mods/mod/shaders/glsl/compute_downscale.cs ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: ps/trunk/binaries/data/mods/mod/shaders/glsl/compute_downscale.xml =================================================================== --- ps/trunk/binaries/data/mods/mod/shaders/glsl/compute_downscale.xml (nonexistent) +++ ps/trunk/binaries/data/mods/mod/shaders/glsl/compute_downscale.xml (revision 28010) @@ -0,0 +1,6 @@ + + + + + + 
Property changes on: ps/trunk/binaries/data/mods/mod/shaders/glsl/compute_downscale.xml ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: ps/trunk/binaries/data/mods/mod/shaders/glsl/compute_rcas.cs =================================================================== --- ps/trunk/binaries/data/mods/mod/shaders/glsl/compute_rcas.cs (nonexistent) +++ ps/trunk/binaries/data/mods/mod/shaders/glsl/compute_rcas.cs (revision 28010) @@ -0,0 +1,50 @@ +#version 430 + +#include "common/compute.h" + +BEGIN_DRAW_TEXTURES + TEXTURE_2D(0, inTex) +END_DRAW_TEXTURES + +BEGIN_DRAW_UNIFORMS + UNIFORM(float, sharpness) +END_DRAW_UNIFORMS + +STORAGE_2D(0, rgba8, outTex); + +#define A_GPU 1 +#define A_GLSL 1 +#define FSR_RCAS_DENOISE 1 + +// TODO: support 16-bit floats. +#include "ffx_a.h" + +#define FSR_RCAS_F 1 +AF4 FsrRcasLoadF(ASU2 p) { return texelFetch(GET_DRAW_TEXTURE_2D(inTex), ASU2(p), 0); } +void FsrRcasInputF(inout AF1 r, inout AF1 g, inout AF1 b) {} + +#include "ffx_fsr1.h" + +void CurrFilter(AU2 pos) +{ + AU4 const0; + FsrRcasCon(const0, sharpness); + + AF3 c; + FsrRcasF(c.r, c.g, c.b, pos, const0); + imageStore(outTex, ASU2(pos), AF4(c, 1)); +} + +layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in; +void main() +{ + // Do remapping of local xy in workgroup for a more PS-like swizzle pattern. 
+ AU2 gxy = ARmp8x8(gl_LocalInvocationID.x) + AU2(gl_WorkGroupID.x << 4u, gl_WorkGroupID.y << 4u); + CurrFilter(gxy); + gxy.x += 8u; + CurrFilter(gxy); + gxy.y += 8u; + CurrFilter(gxy); + gxy.x -= 8u; + CurrFilter(gxy); +} Property changes on: ps/trunk/binaries/data/mods/mod/shaders/glsl/compute_rcas.cs ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: ps/trunk/binaries/data/mods/mod/shaders/glsl/compute_rcas.xml =================================================================== --- ps/trunk/binaries/data/mods/mod/shaders/glsl/compute_rcas.xml (nonexistent) +++ ps/trunk/binaries/data/mods/mod/shaders/glsl/compute_rcas.xml (revision 28010) @@ -0,0 +1,6 @@ + + + + + + Property changes on: ps/trunk/binaries/data/mods/mod/shaders/glsl/compute_rcas.xml ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: ps/trunk/binaries/data/mods/mod/shaders/glsl/compute_upscale_fsr.cs =================================================================== --- ps/trunk/binaries/data/mods/mod/shaders/glsl/compute_upscale_fsr.cs (nonexistent) +++ ps/trunk/binaries/data/mods/mod/shaders/glsl/compute_upscale_fsr.cs (revision 28010) @@ -0,0 +1,54 @@ +#version 430 + +#include "common/compute.h" + +BEGIN_DRAW_TEXTURES + TEXTURE_2D(0, inTex) +END_DRAW_TEXTURES + +BEGIN_DRAW_UNIFORMS + UNIFORM(vec4, screenSize) +END_DRAW_UNIFORMS + +STORAGE_2D(0, rgba8, outTex); + +#define A_GPU 1 +#define A_GLSL 1 + +// TODO: support 16-bit floats. 
+#include "ffx_a.h" + +#define FSR_EASU_F 1 +AF4 FsrEasuRF(AF2 p) { AF4 res = textureGather(GET_DRAW_TEXTURE_2D(inTex), p, 0); return res; } +AF4 FsrEasuGF(AF2 p) { AF4 res = textureGather(GET_DRAW_TEXTURE_2D(inTex), p, 1); return res; } +AF4 FsrEasuBF(AF2 p) { AF4 res = textureGather(GET_DRAW_TEXTURE_2D(inTex), p, 2); return res; } + +#include "ffx_fsr1.h" + +void CurrFilter(AU2 pos) +{ + uvec4 const0, const1, const2, const3; + FsrEasuCon( + const0, const1, const2, const3, + screenSize.x, screenSize.y, + screenSize.x, screenSize.y, + screenSize.z, screenSize.w); + + AF3 c; + FsrEasuF(c, pos, const0, const1, const2, const3); + imageStore(outTex, ASU2(pos), AF4(c, 1)); +} + +layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in; +void main() +{ + // Do remapping of local xy in workgroup for a more PS-like swizzle pattern. + AU2 gxy = ARmp8x8(gl_LocalInvocationID.x) + AU2(gl_WorkGroupID.x << 4u, gl_WorkGroupID.y << 4u); + CurrFilter(gxy); + gxy.x += 8u; + CurrFilter(gxy); + gxy.y += 8u; + CurrFilter(gxy); + gxy.x -= 8u; + CurrFilter(gxy); +} Property changes on: ps/trunk/binaries/data/mods/mod/shaders/glsl/compute_upscale_fsr.cs ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: ps/trunk/binaries/data/mods/mod/shaders/glsl/compute_upscale_fsr.xml =================================================================== --- ps/trunk/binaries/data/mods/mod/shaders/glsl/compute_upscale_fsr.xml (nonexistent) +++ ps/trunk/binaries/data/mods/mod/shaders/glsl/compute_upscale_fsr.xml (revision 28010) @@ -0,0 +1,6 @@ + + + + + + Property changes on: ps/trunk/binaries/data/mods/mod/shaders/glsl/compute_upscale_fsr.xml ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: ps/trunk/source/renderer/backend/PipelineState.h 
=================================================================== --- ps/trunk/source/renderer/backend/PipelineState.h (revision 28009) +++ ps/trunk/source/renderer/backend/PipelineState.h (revision 28010) @@ -1,200 +1,216 @@ -/* Copyright (C) 2022 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . */ #ifndef INCLUDED_RENDERER_BACKEND_PIPELINESTATE #define INCLUDED_RENDERER_BACKEND_PIPELINESTATE #include "graphics/Color.h" #include "renderer/backend/CompareOp.h" #include "renderer/backend/IDeviceObject.h" #include "renderer/backend/IShaderProgram.h" class CStr; namespace Renderer { namespace Backend { enum class StencilOp { // Keeps the current value. KEEP, // Sets the value to zero. ZERO, // Sets the value to reference. REPLACE, // Increments the value and clamps to the maximum representable unsigned // value. INCREMENT_AND_CLAMP, // Decrements the value and clamps to zero. DECREMENT_AND_CLAMP, // Bitwise inverts the value. INVERT, // Increments the value and wraps it to zero when incrementing the maximum // representable unsigned value. INCREMENT_AND_WRAP, // Decrements the value and wraps it to the maximum representable unsigned // value when decrementing zero. 
DECREMENT_AND_WRAP }; struct SStencilOpState { StencilOp failOp; StencilOp passOp; StencilOp depthFailOp; CompareOp compareOp; }; struct SDepthStencilStateDesc { bool depthTestEnabled; CompareOp depthCompareOp; bool depthWriteEnabled; bool stencilTestEnabled; uint32_t stencilReadMask; uint32_t stencilWriteMask; uint32_t stencilReference; SStencilOpState stencilFrontFace; SStencilOpState stencilBackFace; }; // TODO: add per constant description. enum class BlendFactor { ZERO, ONE, SRC_COLOR, ONE_MINUS_SRC_COLOR, DST_COLOR, ONE_MINUS_DST_COLOR, SRC_ALPHA, ONE_MINUS_SRC_ALPHA, DST_ALPHA, ONE_MINUS_DST_ALPHA, CONSTANT_COLOR, ONE_MINUS_CONSTANT_COLOR, CONSTANT_ALPHA, ONE_MINUS_CONSTANT_ALPHA, SRC_ALPHA_SATURATE, SRC1_COLOR, ONE_MINUS_SRC1_COLOR, SRC1_ALPHA, ONE_MINUS_SRC1_ALPHA, }; enum class BlendOp { ADD, SUBTRACT, REVERSE_SUBTRACT, MIN, MAX }; // Using a namespace instead of a enum allows using the same syntax while // avoiding adding operator overrides and additional checks on casts. namespace ColorWriteMask { constexpr uint8_t RED = 0x01; constexpr uint8_t GREEN = 0x02; constexpr uint8_t BLUE = 0x04; constexpr uint8_t ALPHA = 0x08; } // namespace ColorWriteMask struct SBlendStateDesc { bool enabled; BlendFactor srcColorBlendFactor; BlendFactor dstColorBlendFactor; BlendOp colorBlendOp; BlendFactor srcAlphaBlendFactor; BlendFactor dstAlphaBlendFactor; BlendOp alphaBlendOp; CColor constant; uint8_t colorWriteMask; }; enum class PolygonMode { FILL, LINE }; enum class CullMode { NONE, FRONT, BACK }; enum class FrontFace { COUNTER_CLOCKWISE, CLOCKWISE }; struct SRasterizationStateDesc { PolygonMode polygonMode; CullMode cullMode; FrontFace frontFace; bool depthBiasEnabled; float depthBiasConstantFactor; float depthBiasSlopeFactor; }; struct SGraphicsPipelineStateDesc { // It's a backend client reponsibility to keep the shader program alive // while it's bound. 
IShaderProgram* shaderProgram; SDepthStencilStateDesc depthStencilState; SBlendStateDesc blendState; SRasterizationStateDesc rasterizationState; }; +struct SComputePipelineStateDesc +{ + // It's a backend client responsibility to keep the shader program alive + // while it's bound. + IShaderProgram* shaderProgram; +}; + // We don't provide additional helpers intentionally because all custom states // should be described with a related shader and should be switched together. SGraphicsPipelineStateDesc MakeDefaultGraphicsPipelineStateDesc(); StencilOp ParseStencilOp(const CStr& str); BlendFactor ParseBlendFactor(const CStr& str); BlendOp ParseBlendOp(const CStr& str); PolygonMode ParsePolygonMode(const CStr& str); CullMode ParseCullMode(const CStr& str); FrontFace ParseFrontFace(const CStr& str); /** * A holder for precompiled graphics pipeline description. */ class IGraphicsPipelineState : public IDeviceObject { public: virtual IShaderProgram* GetShaderProgram() const = 0; }; +/** + * A holder for precompiled compute pipeline description. + */ +class IComputePipelineState : public IDeviceObject +{ +public: + virtual IShaderProgram* GetShaderProgram() const = 0; +}; + } // namespace Backend } // namespace Renderer #endif // INCLUDED_RENDERER_BACKEND_PIPELINESTATE Index: ps/trunk/source/renderer/backend/dummy/DeviceCommandContext.cpp =================================================================== --- ps/trunk/source/renderer/backend/dummy/DeviceCommandContext.cpp (revision 28009) +++ ps/trunk/source/renderer/backend/dummy/DeviceCommandContext.cpp (revision 28010) @@ -1,218 +1,239 @@ -/* Copyright (C) 2023 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D.
is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . */ #include "precompiled.h" #include "DeviceCommandContext.h" #include "renderer/backend/dummy/Buffer.h" #include "renderer/backend/dummy/Device.h" #include "renderer/backend/dummy/Framebuffer.h" #include "renderer/backend/dummy/ShaderProgram.h" #include "renderer/backend/dummy/Texture.h" namespace Renderer { namespace Backend { namespace Dummy { // static std::unique_ptr CDeviceCommandContext::Create(CDevice* device) { std::unique_ptr deviceCommandContext(new CDeviceCommandContext()); deviceCommandContext->m_Device = device; return deviceCommandContext; } CDeviceCommandContext::CDeviceCommandContext() = default; CDeviceCommandContext::~CDeviceCommandContext() = default; IDevice* CDeviceCommandContext::GetDevice() { return m_Device; } void CDeviceCommandContext::SetGraphicsPipelineState( IGraphicsPipelineState*) { } +void CDeviceCommandContext::SetComputePipelineState( + IComputePipelineState*) +{ +} + void CDeviceCommandContext::UploadTexture( ITexture*, const Format, const void*, const size_t, const uint32_t, const uint32_t) { } void CDeviceCommandContext::UploadTextureRegion( ITexture*, const Format, const void*, const size_t, const uint32_t, const uint32_t, const uint32_t, const uint32_t, const uint32_t, const uint32_t) { } void CDeviceCommandContext::UploadBuffer(IBuffer*, const void*, const uint32_t) { } void CDeviceCommandContext::UploadBuffer(IBuffer*, const UploadBufferFunction&) { } void CDeviceCommandContext::UploadBufferRegion( IBuffer*, const void*, const uint32_t, const uint32_t) { } void CDeviceCommandContext::UploadBufferRegion( IBuffer*, const uint32_t, const uint32_t, const UploadBufferFunction&) { } void 
CDeviceCommandContext::BeginScopedLabel(const char*) { } void CDeviceCommandContext::EndScopedLabel() { } void CDeviceCommandContext::Flush() { } void CDeviceCommandContext::BlitFramebuffer( IFramebuffer*, IFramebuffer*, const Rect&, const Rect&, const Sampler::Filter) { } void CDeviceCommandContext::ResolveFramebuffer(IFramebuffer*, IFramebuffer*) { } void CDeviceCommandContext::ClearFramebuffer(const bool, const bool, const bool) { } void CDeviceCommandContext::BeginFramebufferPass(IFramebuffer*) { } void CDeviceCommandContext::EndFramebufferPass() { } void CDeviceCommandContext::ReadbackFramebufferSync( const uint32_t, const uint32_t, const uint32_t, const uint32_t, void*) { } void CDeviceCommandContext::SetScissors(const uint32_t, const Rect*) { } void CDeviceCommandContext::SetViewports(const uint32_t, const Rect*) { } void CDeviceCommandContext::SetVertexInputLayout(IVertexInputLayout*) { } void CDeviceCommandContext::SetVertexBuffer(const uint32_t, IBuffer*, const uint32_t) { } void CDeviceCommandContext::SetVertexBufferData( const uint32_t, const void*, const uint32_t) { } void CDeviceCommandContext::SetIndexBuffer(IBuffer*) { } void CDeviceCommandContext::SetIndexBufferData(const void*, const uint32_t) { } void CDeviceCommandContext::BeginPass() { } void CDeviceCommandContext::EndPass() { } void CDeviceCommandContext::Draw(const uint32_t, const uint32_t) { } void CDeviceCommandContext::DrawIndexed(const uint32_t, const uint32_t, const int32_t) { } void CDeviceCommandContext::DrawInstanced( const uint32_t, const uint32_t, const uint32_t, const uint32_t) { } void CDeviceCommandContext::DrawIndexedInstanced( const uint32_t, const uint32_t, const uint32_t, const uint32_t, const int32_t) { } void CDeviceCommandContext::DrawIndexedInRange( const uint32_t, const uint32_t, const uint32_t, const uint32_t) { } +void CDeviceCommandContext::BeginComputePass() +{ +} + +void CDeviceCommandContext::EndComputePass() +{ +} + +void CDeviceCommandContext::Dispatch(const 
uint32_t, const uint32_t, const uint32_t) +{ +} + void CDeviceCommandContext::SetTexture(const int32_t, ITexture*) { } +void CDeviceCommandContext::SetStorageTexture(const int32_t, ITexture*) +{ +} + void CDeviceCommandContext::SetUniform(const int32_t, const float) { } void CDeviceCommandContext::SetUniform(const int32_t, const float, const float) { } void CDeviceCommandContext::SetUniform( const int32_t, const float, const float, const float) { } void CDeviceCommandContext::SetUniform( const int32_t, const float, const float, const float, const float) { } void CDeviceCommandContext::SetUniform(const int32_t, PS::span) { } } // namespace Dummy } // namespace Backend } // namespace Renderer Index: ps/trunk/source/renderer/backend/dummy/PipelineState.h =================================================================== --- ps/trunk/source/renderer/backend/dummy/PipelineState.h (revision 28009) +++ ps/trunk/source/renderer/backend/dummy/PipelineState.h (revision 28010) @@ -1,67 +1,89 @@ -/* Copyright (C) 2022 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . 
*/ #ifndef INCLUDED_RENDERER_BACKEND_DUMMY_PIPELINESTATE #define INCLUDED_RENDERER_BACKEND_DUMMY_PIPELINESTATE #include "renderer/backend/PipelineState.h" #include #include namespace Renderer { namespace Backend { namespace Dummy { class CDevice; class CGraphicsPipelineState final : public IGraphicsPipelineState { public: ~CGraphicsPipelineState() override = default; IDevice* GetDevice() override; IShaderProgram* GetShaderProgram() const override { return m_Desc.shaderProgram; } const SGraphicsPipelineStateDesc& GetDesc() const { return m_Desc; } private: friend class CDevice; static std::unique_ptr Create( CDevice* device, const SGraphicsPipelineStateDesc& desc); CGraphicsPipelineState() = default; CDevice* m_Device = nullptr; SGraphicsPipelineStateDesc m_Desc{}; }; +class CComputePipelineState final : public IComputePipelineState +{ +public: + ~CComputePipelineState() override = default; + + IDevice* GetDevice() override; + + IShaderProgram* GetShaderProgram() const override { return m_Desc.shaderProgram; } + +private: + friend class CDevice; + + static std::unique_ptr Create( + CDevice* device, const SComputePipelineStateDesc& desc); + + CComputePipelineState() = default; + + CDevice* m_Device = nullptr; + + SComputePipelineStateDesc m_Desc{}; +}; + } // namespace Dummy } // namespace Backend } // namespace Renderer #endif // INCLUDED_RENDERER_BACKEND_DUMMY_PIPELINESTATE Index: ps/trunk/source/renderer/backend/gl/DeviceCommandContext.cpp =================================================================== --- ps/trunk/source/renderer/backend/gl/DeviceCommandContext.cpp (revision 28009) +++ ps/trunk/source/renderer/backend/gl/DeviceCommandContext.cpp (revision 28010) @@ -1,1346 +1,1416 @@ -/* Copyright (C) 2023 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. 
is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . */ #include "precompiled.h" #include "DeviceCommandContext.h" #include "ps/CLogger.h" #include "renderer/backend/gl/Buffer.h" #include "renderer/backend/gl/Device.h" #include "renderer/backend/gl/Framebuffer.h" #include "renderer/backend/gl/Mapping.h" #include "renderer/backend/gl/PipelineState.h" #include "renderer/backend/gl/ShaderProgram.h" #include "renderer/backend/gl/Texture.h" #include #include #include namespace Renderer { namespace Backend { namespace GL { namespace { bool operator==(const SStencilOpState& lhs, const SStencilOpState& rhs) { return lhs.failOp == rhs.failOp && lhs.passOp == rhs.passOp && lhs.depthFailOp == rhs.depthFailOp && lhs.compareOp == rhs.compareOp; } bool operator!=(const SStencilOpState& lhs, const SStencilOpState& rhs) { return !operator==(lhs, rhs); } bool operator==( const CDeviceCommandContext::Rect& lhs, const CDeviceCommandContext::Rect& rhs) { return lhs.x == rhs.x && lhs.y == rhs.y && lhs.width == rhs.width && lhs.height == rhs.height; } bool operator!=( const CDeviceCommandContext::Rect& lhs, const CDeviceCommandContext::Rect& rhs) { return !operator==(lhs, rhs); } void ApplyDepthMask(const bool depthWriteEnabled) { glDepthMask(depthWriteEnabled ? GL_TRUE : GL_FALSE); } void ApplyColorMask(const uint8_t colorWriteMask) { glColorMask( (colorWriteMask & ColorWriteMask::RED) != 0 ? GL_TRUE : GL_FALSE, (colorWriteMask & ColorWriteMask::GREEN) != 0 ? 
GL_TRUE : GL_FALSE, (colorWriteMask & ColorWriteMask::BLUE) != 0 ? GL_TRUE : GL_FALSE, (colorWriteMask & ColorWriteMask::ALPHA) != 0 ? GL_TRUE : GL_FALSE); } void ApplyStencilMask(const uint32_t stencilWriteMask) { glStencilMask(stencilWriteMask); } GLenum BufferTypeToGLTarget(const CBuffer::Type type) { GLenum target = GL_ARRAY_BUFFER; switch (type) { case CBuffer::Type::VERTEX: target = GL_ARRAY_BUFFER; break; case CBuffer::Type::INDEX: target = GL_ELEMENT_ARRAY_BUFFER; break; case CBuffer::Type::UPLOAD: case CBuffer::Type::UNIFORM: debug_warn("Unsupported buffer type."); break; }; return target; } void UploadDynamicBufferRegionImpl( const GLenum target, const uint32_t bufferSize, const uint32_t dataOffset, const uint32_t dataSize, const CDeviceCommandContext::UploadBufferFunction& uploadFunction) { ENSURE(dataOffset < dataSize); // Tell the driver that it can reallocate the whole VBO glBufferDataARB(target, bufferSize, nullptr, GL_DYNAMIC_DRAW); ogl_WarnIfError(); while (true) { // (In theory, glMapBufferRange with GL_MAP_INVALIDATE_BUFFER_BIT could be used // here instead of glBufferData(..., NULL, ...) plus glMapBuffer(), but with // current Intel Windows GPU drivers (as of 2015-01) it's much faster if you do // the explicit glBufferData.) void* mappedData = glMapBufferARB(target, GL_WRITE_ONLY); if (mappedData == nullptr) { // This shouldn't happen unless we run out of virtual address space LOGERROR("glMapBuffer failed"); break; } uploadFunction(static_cast(mappedData) + dataOffset); if (glUnmapBufferARB(target) == GL_TRUE) break; // Unmap might fail on e.g. resolution switches, so just try again // and hope it will eventually succeed LOGMESSAGE("glUnmapBuffer failed, trying again...\n"); } } /** * In case we don't need a framebuffer content (because of the following clear * or overwriting by a shader) we might give a hint to a driver via * glInvalidateFramebuffer. 
*/ void InvalidateFramebuffer( CFramebuffer* framebuffer, const bool color, const bool depthStencil) { GLsizei numberOfAttachments = 0; GLenum attachments[8]; const bool isBackbuffer = framebuffer->GetHandle() == 0; if (color && (framebuffer->GetAttachmentMask() & GL_COLOR_BUFFER_BIT)) { if (isBackbuffer) #if CONFIG2_GLES attachments[numberOfAttachments++] = GL_COLOR_EXT; #else attachments[numberOfAttachments++] = GL_COLOR; #endif else attachments[numberOfAttachments++] = GL_COLOR_ATTACHMENT0; } if (depthStencil) { if (isBackbuffer) { if (framebuffer->GetAttachmentMask() & GL_DEPTH_BUFFER_BIT) #if CONFIG2_GLES attachments[numberOfAttachments++] = GL_DEPTH_EXT; #else attachments[numberOfAttachments++] = GL_DEPTH; #endif if (framebuffer->GetAttachmentMask() & GL_STENCIL_BUFFER_BIT) #if CONFIG2_GLES attachments[numberOfAttachments++] = GL_STENCIL_EXT; #else attachments[numberOfAttachments++] = GL_STENCIL; #endif } else { if (framebuffer->GetAttachmentMask() & GL_DEPTH_BUFFER_BIT) attachments[numberOfAttachments++] = GL_DEPTH_ATTACHMENT; if (framebuffer->GetAttachmentMask() & GL_STENCIL_BUFFER_BIT) attachments[numberOfAttachments++] = GL_STENCIL_ATTACHMENT; } } if (numberOfAttachments > 0) { #if CONFIG2_GLES glDiscardFramebufferEXT(GL_FRAMEBUFFER_EXT, numberOfAttachments, attachments); #else glInvalidateFramebuffer(GL_FRAMEBUFFER_EXT, numberOfAttachments, attachments); #endif ogl_WarnIfError(); } } } // anonymous namespace // static std::unique_ptr CDeviceCommandContext::Create(CDevice* device) { std::unique_ptr deviceCommandContext(new CDeviceCommandContext(device)); deviceCommandContext->m_Framebuffer = device->GetCurrentBackbuffer( Renderer::Backend::AttachmentLoadOp::DONT_CARE, Renderer::Backend::AttachmentStoreOp::DONT_CARE, Renderer::Backend::AttachmentLoadOp::DONT_CARE, Renderer::Backend::AttachmentStoreOp::DONT_CARE)->As(); deviceCommandContext->ResetStates(); return deviceCommandContext; } CDeviceCommandContext::CDeviceCommandContext(CDevice* device) : 
m_Device(device) { glActiveTexture(GL_TEXTURE0); glBindTexture(GL_TEXTURE_2D, 0); for (BindUnit& unit : m_BoundTextures) { unit.target = GL_TEXTURE_2D; unit.handle = 0; } for (size_t index = 0; index < m_VertexAttributeFormat.size(); ++index) { m_VertexAttributeFormat[index].active = false; m_VertexAttributeFormat[index].initialized = false; m_VertexAttributeFormat[index].bindingSlot = 0; } for (size_t index = 0; index < m_BoundBuffers.size(); ++index) { const CBuffer::Type type = static_cast(index); const GLenum target = BufferTypeToGLTarget(type); const GLuint handle = 0; m_BoundBuffers[index].first = target; m_BoundBuffers[index].second = handle; } } CDeviceCommandContext::~CDeviceCommandContext() = default; IDevice* CDeviceCommandContext::GetDevice() { return m_Device; } void CDeviceCommandContext::SetGraphicsPipelineState( const SGraphicsPipelineStateDesc& pipelineState) { + ENSURE(!pipelineState.shaderProgram || m_InsideFramebufferPass); SetGraphicsPipelineStateImpl(pipelineState, false); } void CDeviceCommandContext::SetGraphicsPipelineState( IGraphicsPipelineState* pipelineState) { ENSURE(pipelineState); + ENSURE(!pipelineState->GetShaderProgram() || m_InsideFramebufferPass); SetGraphicsPipelineStateImpl( pipelineState->As()->GetDesc(), false); } +void CDeviceCommandContext::SetComputePipelineState( + IComputePipelineState* pipelineState) +{ + ENSURE(m_InsideComputePass); + ENSURE(pipelineState); + const SComputePipelineStateDesc& desc = pipelineState->As()->GetDesc(); + if (m_ComputePipelineStateDesc.shaderProgram != desc.shaderProgram) + { + CShaderProgram* currentShaderProgram = nullptr; + if (m_ComputePipelineStateDesc.shaderProgram) + currentShaderProgram = m_ComputePipelineStateDesc.shaderProgram->As(); + CShaderProgram* nextShaderProgram = nullptr; + if (desc.shaderProgram) + nextShaderProgram = desc.shaderProgram->As(); + + if (nextShaderProgram) + nextShaderProgram->Bind(currentShaderProgram); + else if (currentShaderProgram) +
currentShaderProgram->Unbind(); + + m_ShaderProgram = nextShaderProgram; + } +} + void CDeviceCommandContext::UploadTexture( ITexture* texture, const Format format, const void* data, const size_t dataSize, const uint32_t level, const uint32_t layer) { UploadTextureRegion(texture, format, data, dataSize, 0, 0, std::max(1u, texture->GetWidth() >> level), std::max(1u, texture->GetHeight() >> level), level, layer); } void CDeviceCommandContext::UploadTextureRegion( ITexture* destinationTexture, const Format dataFormat, const void* data, const size_t dataSize, const uint32_t xOffset, const uint32_t yOffset, const uint32_t width, const uint32_t height, const uint32_t level, const uint32_t layer) { ENSURE(destinationTexture); CTexture* texture = destinationTexture->As(); ENSURE(texture->GetUsage() & Renderer::Backend::ITexture::Usage::TRANSFER_DST); ENSURE(width > 0 && height > 0); if (texture->GetType() == CTexture::Type::TEXTURE_2D) { ENSURE(layer == 0); if (texture->GetFormat() == Format::R8G8B8A8_UNORM || texture->GetFormat() == Format::R8G8B8_UNORM || #if !CONFIG2_GLES texture->GetFormat() == Format::R8_UNORM || #endif texture->GetFormat() == Format::A8_UNORM) { ENSURE(texture->GetFormat() == dataFormat); size_t bytesPerPixel = 4; GLenum pixelFormat = GL_RGBA; switch (dataFormat) { case Format::R8G8B8A8_UNORM: break; case Format::R8G8B8_UNORM: pixelFormat = GL_RGB; bytesPerPixel = 3; break; #if !CONFIG2_GLES case Format::R8_UNORM: pixelFormat = GL_RED; bytesPerPixel = 1; break; #endif case Format::A8_UNORM: pixelFormat = GL_ALPHA; bytesPerPixel = 1; break; case Format::L8_UNORM: pixelFormat = GL_LUMINANCE; bytesPerPixel = 1; break; default: debug_warn("Unexpected format."); break; } ENSURE(dataSize == width * height * bytesPerPixel); ScopedBind scopedBind(this, GL_TEXTURE_2D, texture->GetHandle()); glTexSubImage2D(GL_TEXTURE_2D, level, xOffset, yOffset, width, height, pixelFormat, GL_UNSIGNED_BYTE, data); ogl_WarnIfError(); } else if ( texture->GetFormat() == 
Format::BC1_RGB_UNORM || texture->GetFormat() == Format::BC1_RGBA_UNORM || texture->GetFormat() == Format::BC2_UNORM || texture->GetFormat() == Format::BC3_UNORM) { ENSURE(xOffset == 0 && yOffset == 0); ENSURE(texture->GetFormat() == dataFormat); // TODO: add data size check. GLenum internalFormat = GL_COMPRESSED_RGB_S3TC_DXT1_EXT; switch (texture->GetFormat()) { case Format::BC1_RGBA_UNORM: internalFormat = GL_COMPRESSED_RGBA_S3TC_DXT1_EXT; break; case Format::BC2_UNORM: internalFormat = GL_COMPRESSED_RGBA_S3TC_DXT3_EXT; break; case Format::BC3_UNORM: internalFormat = GL_COMPRESSED_RGBA_S3TC_DXT5_EXT; break; default: break; } ScopedBind scopedBind(this, GL_TEXTURE_2D, texture->GetHandle()); glCompressedTexImage2DARB(GL_TEXTURE_2D, level, internalFormat, width, height, 0, dataSize, data); ogl_WarnIfError(); } else debug_warn("Unsupported format"); } else if (texture->GetType() == CTexture::Type::TEXTURE_CUBE) { if (texture->GetFormat() == Format::R8G8B8A8_UNORM) { ENSURE(texture->GetFormat() == dataFormat); ENSURE(level == 0 && layer < 6); ENSURE(xOffset == 0 && yOffset == 0 && texture->GetWidth() == width && texture->GetHeight() == height); const size_t bpp = 4; ENSURE(dataSize == width * height * bpp); // The order of layers should be the following: // front, back, top, bottom, right, left static const GLenum targets[6] = { GL_TEXTURE_CUBE_MAP_POSITIVE_X, GL_TEXTURE_CUBE_MAP_NEGATIVE_X, GL_TEXTURE_CUBE_MAP_POSITIVE_Y, GL_TEXTURE_CUBE_MAP_NEGATIVE_Y, GL_TEXTURE_CUBE_MAP_POSITIVE_Z, GL_TEXTURE_CUBE_MAP_NEGATIVE_Z }; ScopedBind scopedBind(this, GL_TEXTURE_CUBE_MAP, texture->GetHandle()); glTexImage2D(targets[layer], level, GL_RGBA, width, height, 0, GL_RGBA, GL_UNSIGNED_BYTE, data); ogl_WarnIfError(); } else debug_warn("Unsupported format"); } else debug_warn("Unsupported type"); } void CDeviceCommandContext::UploadBuffer(IBuffer* buffer, const void* data, const uint32_t dataSize) { ENSURE(!m_InsideFramebufferPass); UploadBufferRegion(buffer, data, dataSize, 0); } 
void CDeviceCommandContext::UploadBuffer( IBuffer* buffer, const UploadBufferFunction& uploadFunction) { ENSURE(!m_InsideFramebufferPass); UploadBufferRegion(buffer, 0, buffer->GetSize(), uploadFunction); } void CDeviceCommandContext::UploadBufferRegion( IBuffer* buffer, const void* data, const uint32_t dataOffset, const uint32_t dataSize) { ENSURE(!m_InsideFramebufferPass); ENSURE(data); ENSURE(dataOffset + dataSize <= buffer->GetSize()); const GLenum target = BufferTypeToGLTarget(buffer->GetType()); ScopedBufferBind scopedBufferBind(this, buffer->As()); if (buffer->IsDynamic()) { UploadDynamicBufferRegionImpl(target, buffer->GetSize(), dataOffset, dataSize, [data, dataSize](u8* mappedData) { std::memcpy(mappedData, data, dataSize); }); } else { glBufferSubDataARB(target, dataOffset, dataSize, data); ogl_WarnIfError(); } } void CDeviceCommandContext::UploadBufferRegion( IBuffer* buffer, const uint32_t dataOffset, const uint32_t dataSize, const UploadBufferFunction& uploadFunction) { ENSURE(!m_InsideFramebufferPass); ENSURE(dataOffset + dataSize <= buffer->GetSize()); const GLenum target = BufferTypeToGLTarget(buffer->GetType()); ScopedBufferBind scopedBufferBind(this, buffer->As()); ENSURE(buffer->IsDynamic()); UploadDynamicBufferRegionImpl(target, buffer->GetSize(), dataOffset, dataSize, uploadFunction); } void CDeviceCommandContext::BeginScopedLabel(const char* name) { if (!m_Device->GetCapabilities().debugScopedLabels) return; ++m_ScopedLabelDepth; glPushDebugGroup(GL_DEBUG_SOURCE_APPLICATION, 0x0AD, -1, name); } void CDeviceCommandContext::EndScopedLabel() { if (!m_Device->GetCapabilities().debugScopedLabels) return; ENSURE(m_ScopedLabelDepth > 0); --m_ScopedLabelDepth; glPopDebugGroup(); } void CDeviceCommandContext::BindTexture( const uint32_t unit, const GLenum target, const GLuint handle) { ENSURE(unit < m_BoundTextures.size()); #if CONFIG2_GLES ENSURE(target == GL_TEXTURE_2D || target == GL_TEXTURE_CUBE_MAP); #else ENSURE(target == GL_TEXTURE_2D || target 
== GL_TEXTURE_CUBE_MAP || target == GL_TEXTURE_2D_MULTISAMPLE); #endif if (m_ActiveTextureUnit != unit) { glActiveTexture(GL_TEXTURE0 + unit); m_ActiveTextureUnit = unit; } if (m_BoundTextures[unit].target == target && m_BoundTextures[unit].handle == handle) return; if (m_BoundTextures[unit].target != target && m_BoundTextures[unit].target && m_BoundTextures[unit].handle) glBindTexture(m_BoundTextures[unit].target, 0); if (m_BoundTextures[unit].handle != handle) glBindTexture(target, handle); ogl_WarnIfError(); m_BoundTextures[unit] = {target, handle}; } void CDeviceCommandContext::BindBuffer(const IBuffer::Type type, CBuffer* buffer) { ENSURE(!buffer || buffer->GetType() == type); if (type == IBuffer::Type::VERTEX) { if (m_VertexBuffer == buffer) return; m_VertexBuffer = buffer; } else if (type == IBuffer::Type::INDEX) { if (!buffer) m_IndexBuffer = nullptr; m_IndexBufferData = nullptr; } const GLenum target = BufferTypeToGLTarget(type); const GLuint handle = buffer ? buffer->GetHandle() : 0; glBindBufferARB(target, handle); ogl_WarnIfError(); const size_t cacheIndex = static_cast(type); ENSURE(cacheIndex < m_BoundBuffers.size()); m_BoundBuffers[cacheIndex].second = handle; } void CDeviceCommandContext::OnTextureDestroy(CTexture* texture) { ENSURE(texture); for (size_t index = 0; index < m_BoundTextures.size(); ++index) if (m_BoundTextures[index].handle == texture->GetHandle()) BindTexture(index, GL_TEXTURE_2D, 0); } void CDeviceCommandContext::Flush() { ENSURE(m_ScopedLabelDepth == 0); + ENSURE(!m_InsideFramebufferPass); + ENSURE(!m_InsideComputePass); GPU_SCOPED_LABEL(this, "CDeviceCommandContext::Flush"); ResetStates(); m_IndexBuffer = nullptr; m_IndexBufferData = nullptr; for (size_t unit = 0; unit < m_BoundTextures.size(); ++unit) { if (m_BoundTextures[unit].handle) BindTexture(unit, GL_TEXTURE_2D, 0); } BindBuffer(CBuffer::Type::INDEX, nullptr); BindBuffer(CBuffer::Type::VERTEX, nullptr); } void CDeviceCommandContext::ResetStates() { 
SetGraphicsPipelineStateImpl(MakeDefaultGraphicsPipelineStateDesc(), true); SetScissors(0, nullptr); m_Framebuffer = m_Device->GetCurrentBackbuffer( Renderer::Backend::AttachmentLoadOp::DONT_CARE, Renderer::Backend::AttachmentStoreOp::DONT_CARE, Renderer::Backend::AttachmentLoadOp::DONT_CARE, Renderer::Backend::AttachmentStoreOp::DONT_CARE)->As(); glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, m_Framebuffer->GetHandle()); ogl_WarnIfError(); } void CDeviceCommandContext::SetGraphicsPipelineStateImpl( const SGraphicsPipelineStateDesc& pipelineStateDesc, const bool force) { ENSURE(!m_InsidePass); if (m_GraphicsPipelineStateDesc.shaderProgram != pipelineStateDesc.shaderProgram) { CShaderProgram* currentShaderProgram = nullptr; if (m_GraphicsPipelineStateDesc.shaderProgram) { currentShaderProgram = static_cast(m_GraphicsPipelineStateDesc.shaderProgram); } CShaderProgram* nextShaderProgram = nullptr; if (pipelineStateDesc.shaderProgram) { nextShaderProgram = static_cast(pipelineStateDesc.shaderProgram); for (size_t index = 0; index < m_VertexAttributeFormat.size(); ++index) { const VertexAttributeStream stream = static_cast(index); m_VertexAttributeFormat[index].active = nextShaderProgram->IsStreamActive(stream); m_VertexAttributeFormat[index].initialized = false; m_VertexAttributeFormat[index].bindingSlot = std::numeric_limits::max(); } } if (nextShaderProgram) nextShaderProgram->Bind(currentShaderProgram); else if (currentShaderProgram) currentShaderProgram->Unbind(); m_ShaderProgram = nextShaderProgram; } const SDepthStencilStateDesc& currentDepthStencilStateDesc = m_GraphicsPipelineStateDesc.depthStencilState; const SDepthStencilStateDesc& nextDepthStencilStateDesc = pipelineStateDesc.depthStencilState; if (force || currentDepthStencilStateDesc.depthTestEnabled != nextDepthStencilStateDesc.depthTestEnabled) { if (nextDepthStencilStateDesc.depthTestEnabled) glEnable(GL_DEPTH_TEST); else glDisable(GL_DEPTH_TEST); } if (force || currentDepthStencilStateDesc.depthCompareOp != 
nextDepthStencilStateDesc.depthCompareOp) { glDepthFunc(Mapping::FromCompareOp(nextDepthStencilStateDesc.depthCompareOp)); } if (force || currentDepthStencilStateDesc.depthWriteEnabled != nextDepthStencilStateDesc.depthWriteEnabled) { ApplyDepthMask(nextDepthStencilStateDesc.depthWriteEnabled); } if (force || currentDepthStencilStateDesc.stencilTestEnabled != nextDepthStencilStateDesc.stencilTestEnabled) { if (nextDepthStencilStateDesc.stencilTestEnabled) glEnable(GL_STENCIL_TEST); else glDisable(GL_STENCIL_TEST); } if (force || currentDepthStencilStateDesc.stencilFrontFace != nextDepthStencilStateDesc.stencilFrontFace || currentDepthStencilStateDesc.stencilBackFace != nextDepthStencilStateDesc.stencilBackFace) { if (nextDepthStencilStateDesc.stencilFrontFace == nextDepthStencilStateDesc.stencilBackFace) { glStencilOp( Mapping::FromStencilOp(nextDepthStencilStateDesc.stencilFrontFace.failOp), Mapping::FromStencilOp(nextDepthStencilStateDesc.stencilFrontFace.depthFailOp), Mapping::FromStencilOp(nextDepthStencilStateDesc.stencilFrontFace.passOp)); } else { if (force || currentDepthStencilStateDesc.stencilFrontFace != nextDepthStencilStateDesc.stencilFrontFace) { glStencilOpSeparate( GL_FRONT, Mapping::FromStencilOp(nextDepthStencilStateDesc.stencilFrontFace.failOp), Mapping::FromStencilOp(nextDepthStencilStateDesc.stencilFrontFace.depthFailOp), Mapping::FromStencilOp(nextDepthStencilStateDesc.stencilFrontFace.passOp)); } if (force || currentDepthStencilStateDesc.stencilBackFace != nextDepthStencilStateDesc.stencilBackFace) { glStencilOpSeparate( GL_BACK, Mapping::FromStencilOp(nextDepthStencilStateDesc.stencilBackFace.failOp), Mapping::FromStencilOp(nextDepthStencilStateDesc.stencilBackFace.depthFailOp), Mapping::FromStencilOp(nextDepthStencilStateDesc.stencilBackFace.passOp)); } } } if (force || currentDepthStencilStateDesc.stencilWriteMask != nextDepthStencilStateDesc.stencilWriteMask) { ApplyStencilMask(nextDepthStencilStateDesc.stencilWriteMask); } if (force || 
currentDepthStencilStateDesc.stencilReference != nextDepthStencilStateDesc.stencilReference || currentDepthStencilStateDesc.stencilReadMask != nextDepthStencilStateDesc.stencilReadMask || currentDepthStencilStateDesc.stencilFrontFace.compareOp != nextDepthStencilStateDesc.stencilFrontFace.compareOp || currentDepthStencilStateDesc.stencilBackFace.compareOp != nextDepthStencilStateDesc.stencilBackFace.compareOp) { if (nextDepthStencilStateDesc.stencilFrontFace.compareOp == nextDepthStencilStateDesc.stencilBackFace.compareOp) { glStencilFunc( Mapping::FromCompareOp(nextDepthStencilStateDesc.stencilFrontFace.compareOp), nextDepthStencilStateDesc.stencilReference, nextDepthStencilStateDesc.stencilReadMask); } else { glStencilFuncSeparate(GL_FRONT, Mapping::FromCompareOp(nextDepthStencilStateDesc.stencilFrontFace.compareOp), nextDepthStencilStateDesc.stencilReference, nextDepthStencilStateDesc.stencilReadMask); glStencilFuncSeparate(GL_BACK, Mapping::FromCompareOp(nextDepthStencilStateDesc.stencilBackFace.compareOp), nextDepthStencilStateDesc.stencilReference, nextDepthStencilStateDesc.stencilReadMask); } } const SBlendStateDesc& currentBlendStateDesc = m_GraphicsPipelineStateDesc.blendState; const SBlendStateDesc& nextBlendStateDesc = pipelineStateDesc.blendState; if (force || currentBlendStateDesc.enabled != nextBlendStateDesc.enabled) { if (nextBlendStateDesc.enabled) glEnable(GL_BLEND); else glDisable(GL_BLEND); } if (force || currentBlendStateDesc.srcColorBlendFactor != nextBlendStateDesc.srcColorBlendFactor || currentBlendStateDesc.srcAlphaBlendFactor != nextBlendStateDesc.srcAlphaBlendFactor || currentBlendStateDesc.dstColorBlendFactor != nextBlendStateDesc.dstColorBlendFactor || currentBlendStateDesc.dstAlphaBlendFactor != nextBlendStateDesc.dstAlphaBlendFactor) { if (nextBlendStateDesc.srcColorBlendFactor == nextBlendStateDesc.srcAlphaBlendFactor && nextBlendStateDesc.dstColorBlendFactor == nextBlendStateDesc.dstAlphaBlendFactor) { glBlendFunc( 
Mapping::FromBlendFactor(nextBlendStateDesc.srcColorBlendFactor), Mapping::FromBlendFactor(nextBlendStateDesc.dstColorBlendFactor)); } else { glBlendFuncSeparate( Mapping::FromBlendFactor(nextBlendStateDesc.srcColorBlendFactor), Mapping::FromBlendFactor(nextBlendStateDesc.dstColorBlendFactor), Mapping::FromBlendFactor(nextBlendStateDesc.srcAlphaBlendFactor), Mapping::FromBlendFactor(nextBlendStateDesc.dstAlphaBlendFactor)); } } if (force || currentBlendStateDesc.colorBlendOp != nextBlendStateDesc.colorBlendOp || currentBlendStateDesc.alphaBlendOp != nextBlendStateDesc.alphaBlendOp) { if (nextBlendStateDesc.colorBlendOp == nextBlendStateDesc.alphaBlendOp) { glBlendEquation(Mapping::FromBlendOp(nextBlendStateDesc.colorBlendOp)); } else { glBlendEquationSeparate( Mapping::FromBlendOp(nextBlendStateDesc.colorBlendOp), Mapping::FromBlendOp(nextBlendStateDesc.alphaBlendOp)); } } if (force || currentBlendStateDesc.constant != nextBlendStateDesc.constant) { glBlendColor( nextBlendStateDesc.constant.r, nextBlendStateDesc.constant.g, nextBlendStateDesc.constant.b, nextBlendStateDesc.constant.a); } if (force || currentBlendStateDesc.colorWriteMask != nextBlendStateDesc.colorWriteMask) { ApplyColorMask(nextBlendStateDesc.colorWriteMask); } const SRasterizationStateDesc& currentRasterizationStateDesc = m_GraphicsPipelineStateDesc.rasterizationState; const SRasterizationStateDesc& nextRasterizationStateDesc = pipelineStateDesc.rasterizationState; if (force || currentRasterizationStateDesc.polygonMode != nextRasterizationStateDesc.polygonMode) { #if !CONFIG2_GLES glPolygonMode( GL_FRONT_AND_BACK, nextRasterizationStateDesc.polygonMode == PolygonMode::LINE ? 
GL_LINE : GL_FILL); #endif } if (force || currentRasterizationStateDesc.cullMode != nextRasterizationStateDesc.cullMode) { if (nextRasterizationStateDesc.cullMode == CullMode::NONE) { glDisable(GL_CULL_FACE); } else { if (force || currentRasterizationStateDesc.cullMode == CullMode::NONE) glEnable(GL_CULL_FACE); glCullFace(nextRasterizationStateDesc.cullMode == CullMode::FRONT ? GL_FRONT : GL_BACK); } } if (force || currentRasterizationStateDesc.frontFace != nextRasterizationStateDesc.frontFace) { if (nextRasterizationStateDesc.frontFace == FrontFace::CLOCKWISE) glFrontFace(GL_CW); else glFrontFace(GL_CCW); } #if !CONFIG2_GLES if (force || currentRasterizationStateDesc.depthBiasEnabled != nextRasterizationStateDesc.depthBiasEnabled) { if (nextRasterizationStateDesc.depthBiasEnabled) glEnable(GL_POLYGON_OFFSET_FILL); else glDisable(GL_POLYGON_OFFSET_FILL); } if (force || currentRasterizationStateDesc.depthBiasConstantFactor != nextRasterizationStateDesc.depthBiasConstantFactor || currentRasterizationStateDesc.depthBiasSlopeFactor != nextRasterizationStateDesc.depthBiasSlopeFactor) { glPolygonOffset( nextRasterizationStateDesc.depthBiasSlopeFactor, nextRasterizationStateDesc.depthBiasConstantFactor); } #endif ogl_WarnIfError(); m_GraphicsPipelineStateDesc = pipelineStateDesc; } void CDeviceCommandContext::BlitFramebuffer( IFramebuffer* srcFramebuffer, IFramebuffer* dstFramebuffer, const Rect& sourceRegion, const Rect& destinationRegion, const Sampler::Filter filter) { ENSURE(!m_InsideFramebufferPass); CFramebuffer* destinationFramebuffer = dstFramebuffer->As(); CFramebuffer* sourceFramebuffer = srcFramebuffer->As(); #if CONFIG2_GLES UNUSED2(destinationFramebuffer); UNUSED2(sourceFramebuffer); UNUSED2(destinationRegion); UNUSED2(sourceRegion); UNUSED2(filter); debug_warn("CDeviceCommandContext::BlitFramebuffer is not implemented for GLES"); #else // Source framebuffer should not be backbuffer. 
ENSURE(sourceFramebuffer->GetHandle() != 0); ENSURE(destinationFramebuffer != sourceFramebuffer); glBindFramebufferEXT(GL_READ_FRAMEBUFFER_EXT, sourceFramebuffer->GetHandle()); glBindFramebufferEXT(GL_DRAW_FRAMEBUFFER_EXT, destinationFramebuffer->GetHandle()); // TODO: add more check for internal formats. glBlitFramebufferEXT( sourceRegion.x, sourceRegion.y, sourceRegion.width, sourceRegion.height, destinationRegion.x, destinationRegion.y, destinationRegion.width, destinationRegion.height, (sourceFramebuffer->GetAttachmentMask() & destinationFramebuffer->GetAttachmentMask()), filter == Sampler::Filter::LINEAR ? GL_LINEAR : GL_NEAREST); ogl_WarnIfError(); #endif } void CDeviceCommandContext::ResolveFramebuffer( IFramebuffer* srcFramebuffer, IFramebuffer* dstFramebuffer) { ENSURE(!m_InsideFramebufferPass); CFramebuffer* destinationFramebuffer = dstFramebuffer->As(); CFramebuffer* sourceFramebuffer = srcFramebuffer->As(); ENSURE(destinationFramebuffer->GetWidth() == sourceFramebuffer->GetWidth()); ENSURE(destinationFramebuffer->GetHeight() == sourceFramebuffer->GetHeight()); #if CONFIG2_GLES UNUSED2(destinationFramebuffer); UNUSED2(sourceFramebuffer); debug_warn("CDeviceCommandContext::ResolveFramebuffer is not implemented for GLES"); #else // Source framebuffer should not be backbuffer. 
ENSURE(sourceFramebuffer->GetHandle() != 0); ENSURE(destinationFramebuffer != sourceFramebuffer); glBindFramebufferEXT(GL_READ_FRAMEBUFFER_EXT, sourceFramebuffer->GetHandle()); glBindFramebufferEXT(GL_DRAW_FRAMEBUFFER_EXT, destinationFramebuffer->GetHandle()); glBlitFramebufferEXT( 0, 0, sourceFramebuffer->GetWidth(), sourceFramebuffer->GetHeight(), 0, 0, sourceFramebuffer->GetWidth(), sourceFramebuffer->GetHeight(), (sourceFramebuffer->GetAttachmentMask() & destinationFramebuffer->GetAttachmentMask()), GL_NEAREST); ogl_WarnIfError(); #endif } void CDeviceCommandContext::ClearFramebuffer(const bool color, const bool depth, const bool stencil) { ENSURE(m_InsideFramebufferPass); const bool needsColor = color && (m_Framebuffer->GetAttachmentMask() & GL_COLOR_BUFFER_BIT) != 0; const bool needsDepth = depth && (m_Framebuffer->GetAttachmentMask() & GL_DEPTH_BUFFER_BIT) != 0; const bool needsStencil = stencil && (m_Framebuffer->GetAttachmentMask() & GL_STENCIL_BUFFER_BIT) != 0; GLbitfield mask = 0; if (needsColor) { ApplyColorMask(ColorWriteMask::RED | ColorWriteMask::GREEN | ColorWriteMask::BLUE | ColorWriteMask::ALPHA); glClearColor( m_Framebuffer->GetClearColor().r, m_Framebuffer->GetClearColor().g, m_Framebuffer->GetClearColor().b, m_Framebuffer->GetClearColor().a); mask |= GL_COLOR_BUFFER_BIT; } if (needsDepth) { ApplyDepthMask(true); mask |= GL_DEPTH_BUFFER_BIT; } if (needsStencil) { ApplyStencilMask(std::numeric_limits::max()); mask |= GL_STENCIL_BUFFER_BIT; } glClear(mask); ogl_WarnIfError(); if (needsColor) ApplyColorMask(m_GraphicsPipelineStateDesc.blendState.colorWriteMask); if (needsDepth) ApplyDepthMask(m_GraphicsPipelineStateDesc.depthStencilState.depthWriteEnabled); if (needsStencil) ApplyStencilMask(m_GraphicsPipelineStateDesc.depthStencilState.stencilWriteMask); } void CDeviceCommandContext::BeginFramebufferPass(IFramebuffer* framebuffer) { SetGraphicsPipelineStateImpl( MakeDefaultGraphicsPipelineStateDesc(), false); ENSURE(!m_InsideFramebufferPass); 
m_InsideFramebufferPass = true; ENSURE(framebuffer); m_Framebuffer = framebuffer->As(); ENSURE(m_Framebuffer->GetHandle() == 0 || (m_Framebuffer->GetWidth() > 0 && m_Framebuffer->GetHeight() > 0)); glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, m_Framebuffer->GetHandle()); ogl_WarnIfError(); if (m_Device->UseFramebufferInvalidating()) { InvalidateFramebuffer( m_Framebuffer, m_Framebuffer->GetColorAttachmentLoadOp() != AttachmentLoadOp::LOAD, m_Framebuffer->GetDepthStencilAttachmentLoadOp() != AttachmentLoadOp::LOAD); } const bool needsClearColor = m_Framebuffer->GetColorAttachmentLoadOp() == AttachmentLoadOp::CLEAR; const bool needsClearDepthStencil = m_Framebuffer->GetDepthStencilAttachmentLoadOp() == AttachmentLoadOp::CLEAR; if (needsClearColor || needsClearDepthStencil) { ClearFramebuffer( needsClearColor, needsClearDepthStencil, needsClearDepthStencil); } } void CDeviceCommandContext::EndFramebufferPass() { if (m_Device->UseFramebufferInvalidating()) { InvalidateFramebuffer( m_Framebuffer, m_Framebuffer->GetColorAttachmentStoreOp() != AttachmentStoreOp::STORE, m_Framebuffer->GetDepthStencilAttachmentStoreOp() != AttachmentStoreOp::STORE); } ENSURE(m_InsideFramebufferPass); m_InsideFramebufferPass = false; CFramebuffer* framebuffer = m_Device->GetCurrentBackbuffer( Renderer::Backend::AttachmentLoadOp::DONT_CARE, Renderer::Backend::AttachmentStoreOp::DONT_CARE, Renderer::Backend::AttachmentLoadOp::DONT_CARE, Renderer::Backend::AttachmentStoreOp::DONT_CARE)->As(); if (framebuffer->GetHandle() != m_Framebuffer->GetHandle()) { glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, framebuffer->GetHandle()); ogl_WarnIfError(); } m_Framebuffer = framebuffer; + + SetGraphicsPipelineStateImpl(MakeDefaultGraphicsPipelineStateDesc(), false); } void CDeviceCommandContext::ReadbackFramebufferSync( const uint32_t x, const uint32_t y, const uint32_t width, const uint32_t height, void* data) { ENSURE(m_Framebuffer); glReadPixels(x, y, width, height, GL_RGB, GL_UNSIGNED_BYTE, data); 
ogl_WarnIfError(); } void CDeviceCommandContext::SetScissors(const uint32_t scissorCount, const Rect* scissors) { ENSURE(scissorCount <= 1); if (scissorCount == 0) { if (m_ScissorCount != scissorCount) glDisable(GL_SCISSOR_TEST); } else { if (m_ScissorCount != scissorCount) glEnable(GL_SCISSOR_TEST); ENSURE(scissors); if (m_ScissorCount != scissorCount || m_Scissors[0] != scissors[0]) { m_Scissors[0] = scissors[0]; glScissor(m_Scissors[0].x, m_Scissors[0].y, m_Scissors[0].width, m_Scissors[0].height); } } ogl_WarnIfError(); m_ScissorCount = scissorCount; } void CDeviceCommandContext::SetViewports(const uint32_t viewportCount, const Rect* viewports) { ENSURE(m_InsideFramebufferPass); ENSURE(viewportCount == 1); glViewport(viewports[0].x, viewports[0].y, viewports[0].width, viewports[0].height); ogl_WarnIfError(); } void CDeviceCommandContext::SetVertexInputLayout( IVertexInputLayout* vertexInputLayout) { ENSURE(vertexInputLayout); for (const SVertexAttributeFormat& attribute : vertexInputLayout->As()->GetAttributes()) { const uint32_t index = static_cast(attribute.stream); ENSURE(index < m_VertexAttributeFormat.size()); ENSURE(attribute.bindingSlot < m_VertexAttributeFormat.size()); if (!m_VertexAttributeFormat[index].active) continue; m_VertexAttributeFormat[index].format = attribute.format; m_VertexAttributeFormat[index].offset = attribute.offset; m_VertexAttributeFormat[index].stride = attribute.stride; m_VertexAttributeFormat[index].rate = attribute.rate; m_VertexAttributeFormat[index].bindingSlot = attribute.bindingSlot; m_VertexAttributeFormat[index].initialized = true; } } void CDeviceCommandContext::SetVertexBuffer( const uint32_t bindingSlot, IBuffer* buffer, const uint32_t offset) { ENSURE(buffer); ENSURE(buffer->GetType() == IBuffer::Type::VERTEX); ENSURE(m_ShaderProgram); BindBuffer(buffer->GetType(), buffer->As()); for (size_t index = 0; index < m_VertexAttributeFormat.size(); ++index) { if (!m_VertexAttributeFormat[index].active || 
m_VertexAttributeFormat[index].bindingSlot != bindingSlot) continue; ENSURE(m_VertexAttributeFormat[index].initialized); const VertexAttributeStream stream = static_cast(index); m_ShaderProgram->VertexAttribPointer(stream, m_VertexAttributeFormat[index].format, m_VertexAttributeFormat[index].offset + offset, m_VertexAttributeFormat[index].stride, m_VertexAttributeFormat[index].rate, nullptr); } } void CDeviceCommandContext::SetVertexBufferData( const uint32_t bindingSlot, const void* data, const uint32_t dataSize) { ENSURE(data); ENSURE(m_ShaderProgram); ENSURE(dataSize > 0); BindBuffer(CBuffer::Type::VERTEX, nullptr); for (size_t index = 0; index < m_VertexAttributeFormat.size(); ++index) { if (!m_VertexAttributeFormat[index].active || m_VertexAttributeFormat[index].bindingSlot != bindingSlot) continue; ENSURE(m_VertexAttributeFormat[index].initialized); const VertexAttributeStream stream = static_cast(index); // We don't know how many vertices will be used in a draw command, so we // assume at least one vertex. 
ENSURE(dataSize >= m_VertexAttributeFormat[index].offset + m_VertexAttributeFormat[index].stride); m_ShaderProgram->VertexAttribPointer(stream, m_VertexAttributeFormat[index].format, m_VertexAttributeFormat[index].offset, m_VertexAttributeFormat[index].stride, m_VertexAttributeFormat[index].rate, data); } } void CDeviceCommandContext::SetIndexBuffer(IBuffer* buffer) { ENSURE(buffer->GetType() == CBuffer::Type::INDEX); m_IndexBuffer = buffer->As(); m_IndexBufferData = nullptr; BindBuffer(CBuffer::Type::INDEX, m_IndexBuffer); } void CDeviceCommandContext::SetIndexBufferData(const void* data, const uint32_t dataSize) { ENSURE(dataSize > 0); if (m_IndexBuffer) { BindBuffer(CBuffer::Type::INDEX, nullptr); m_IndexBuffer = nullptr; } m_IndexBufferData = data; } void CDeviceCommandContext::BeginPass() { ENSURE(!m_InsidePass); m_InsidePass = true; } void CDeviceCommandContext::EndPass() { ENSURE(m_InsidePass); m_InsidePass = false; } void CDeviceCommandContext::Draw( const uint32_t firstVertex, const uint32_t vertexCount) { ENSURE(m_ShaderProgram); ENSURE(m_InsidePass); // Some drivers apparently don't like count = 0 in glDrawArrays here, so skip // all drawing in that case. if (vertexCount == 0) return; m_ShaderProgram->AssertPointersBound(); glDrawArrays(GL_TRIANGLES, firstVertex, vertexCount); ogl_WarnIfError(); } void CDeviceCommandContext::DrawIndexed( const uint32_t firstIndex, const uint32_t indexCount, const int32_t vertexOffset) { ENSURE(m_ShaderProgram); ENSURE(m_InsidePass); if (indexCount == 0) return; ENSURE(m_IndexBuffer || m_IndexBufferData); ENSURE(vertexOffset == 0); if (m_IndexBuffer) { ENSURE(sizeof(uint16_t) * (firstIndex + indexCount) <= m_IndexBuffer->GetSize()); } m_ShaderProgram->AssertPointersBound(); // Don't use glMultiDrawElements here since it doesn't have a significant // performance impact and it suffers from various driver bugs (e.g. it breaks // in Mesa 7.10 swrast with index VBOs). 
glDrawElements(GL_TRIANGLES, indexCount, GL_UNSIGNED_SHORT, static_cast((static_cast(m_IndexBufferData) + sizeof(uint16_t) * firstIndex))); ogl_WarnIfError(); } void CDeviceCommandContext::DrawInstanced( const uint32_t firstVertex, const uint32_t vertexCount, const uint32_t firstInstance, const uint32_t instanceCount) { ENSURE(m_Device->GetCapabilities().instancing); ENSURE(m_ShaderProgram); ENSURE(m_InsidePass); if (vertexCount == 0 || instanceCount == 0) return; ENSURE(firstInstance == 0); m_ShaderProgram->AssertPointersBound(); #if CONFIG2_GLES ENSURE(!m_Device->GetCapabilities().instancing); UNUSED2(firstVertex); UNUSED2(vertexCount); UNUSED2(instanceCount); #else glDrawArraysInstancedARB(GL_TRIANGLES, firstVertex, vertexCount, instanceCount); #endif ogl_WarnIfError(); } void CDeviceCommandContext::DrawIndexedInstanced( const uint32_t firstIndex, const uint32_t indexCount, const uint32_t firstInstance, const uint32_t instanceCount, const int32_t vertexOffset) { ENSURE(m_Device->GetCapabilities().instancing); ENSURE(m_ShaderProgram); ENSURE(m_InsidePass); ENSURE(m_IndexBuffer || m_IndexBufferData); if (indexCount == 0) return; ENSURE(firstInstance == 0 && vertexOffset == 0); if (m_IndexBuffer) { ENSURE(sizeof(uint16_t) * (firstIndex + indexCount) <= m_IndexBuffer->GetSize()); } m_ShaderProgram->AssertPointersBound(); // Don't use glMultiDrawElements here since it doesn't have a significant // performance impact and it suffers from various driver bugs (e.g. it breaks // in Mesa 7.10 swrast with index VBOs). 
#if CONFIG2_GLES ENSURE(!m_Device->GetCapabilities().instancing); UNUSED2(indexCount); UNUSED2(firstIndex); UNUSED2(instanceCount); #else glDrawElementsInstancedARB(GL_TRIANGLES, indexCount, GL_UNSIGNED_SHORT, static_cast((static_cast(m_IndexBufferData) + sizeof(uint16_t) * firstIndex)), instanceCount); #endif ogl_WarnIfError(); } void CDeviceCommandContext::DrawIndexedInRange( const uint32_t firstIndex, const uint32_t indexCount, const uint32_t start, const uint32_t end) { ENSURE(m_ShaderProgram); ENSURE(m_InsidePass); if (indexCount == 0) return; ENSURE(m_IndexBuffer || m_IndexBufferData); const void* indices = static_cast((static_cast(m_IndexBufferData) + sizeof(uint16_t) * firstIndex)); m_ShaderProgram->AssertPointersBound(); // Draw with DrawRangeElements where available, since it might be more // efficient for slow hardware. #if CONFIG2_GLES UNUSED2(start); UNUSED2(end); glDrawElements(GL_TRIANGLES, indexCount, GL_UNSIGNED_SHORT, indices); #else glDrawRangeElementsEXT(GL_TRIANGLES, start, end, indexCount, GL_UNSIGNED_SHORT, indices); #endif ogl_WarnIfError(); } +void CDeviceCommandContext::BeginComputePass() +{ + ENSURE(!m_InsideFramebufferPass); + ENSURE(!m_InsideComputePass); + m_InsideComputePass = true; +} + +void CDeviceCommandContext::EndComputePass() +{ + ENSURE(m_InsideComputePass); + m_InsideComputePass = false; +} + +void CDeviceCommandContext::Dispatch( + const uint32_t groupCountX, + const uint32_t groupCountY, + const uint32_t groupCountZ) +{ + ENSURE(m_InsideComputePass); + glDispatchCompute(groupCountX, groupCountY, groupCountZ); + // TODO: we might want to do binding tracking to avoid redundant barriers. 
+ glMemoryBarrier( + GL_SHADER_IMAGE_ACCESS_BARRIER_BIT | GL_TEXTURE_FETCH_BARRIER_BIT | GL_TEXTURE_UPDATE_BARRIER_BIT | GL_FRAMEBUFFER_BARRIER_BIT); +} + void CDeviceCommandContext::SetTexture(const int32_t bindingSlot, ITexture* texture) { ENSURE(m_ShaderProgram); ENSURE(texture); ENSURE(texture->GetUsage() & Renderer::Backend::ITexture::Usage::SAMPLED); const CShaderProgram::TextureUnit textureUnit = m_ShaderProgram->GetTextureUnit(bindingSlot); if (!textureUnit.type) return; if (textureUnit.type != GL_SAMPLER_2D && #if !CONFIG2_GLES textureUnit.type != GL_SAMPLER_2D_SHADOW && #endif textureUnit.type != GL_SAMPLER_CUBE) { LOGERROR("CDeviceCommandContext::SetTexture: expected sampler at binding slot"); return; } #if !CONFIG2_GLES if (textureUnit.type == GL_SAMPLER_2D_SHADOW) { if (!IsDepthFormat(texture->GetFormat())) { LOGERROR("CDeviceCommandContext::SetTexture: Invalid texture type (expected depth texture)"); return; } } #endif ENSURE(textureUnit.unit >= 0); const uint32_t unit = textureUnit.unit; if (unit >= m_BoundTextures.size()) { LOGERROR("CDeviceCommandContext::SetTexture: Invalid texture unit (too big)"); return; } BindTexture(unit, textureUnit.target, texture->As()->GetHandle()); } +void CDeviceCommandContext::SetStorageTexture(const int32_t bindingSlot, ITexture* texture) +{ + ENSURE(m_ShaderProgram); + ENSURE(texture); + ENSURE(texture->GetUsage() & Renderer::Backend::ITexture::Usage::STORAGE); + + const CShaderProgram::TextureUnit textureUnit = + m_ShaderProgram->GetTextureUnit(bindingSlot); + if (!textureUnit.type) + return; + ENSURE(textureUnit.type == GL_IMAGE_2D); + ENSURE(texture->GetFormat() == Format::R8G8B8A8_UNORM); + glBindImageTexture(textureUnit.unit, texture->As()->GetHandle(), 0, GL_FALSE, 0, GL_READ_WRITE, GL_RGBA8); +} + void CDeviceCommandContext::SetUniform( const int32_t bindingSlot, const float value) { ENSURE(m_ShaderProgram); m_ShaderProgram->SetUniform(bindingSlot, value); } void CDeviceCommandContext::SetUniform( const 
int32_t bindingSlot, const float valueX, const float valueY) { ENSURE(m_ShaderProgram); m_ShaderProgram->SetUniform(bindingSlot, valueX, valueY); } void CDeviceCommandContext::SetUniform( const int32_t bindingSlot, const float valueX, const float valueY, const float valueZ) { ENSURE(m_ShaderProgram); m_ShaderProgram->SetUniform(bindingSlot, valueX, valueY, valueZ); } void CDeviceCommandContext::SetUniform( const int32_t bindingSlot, const float valueX, const float valueY, const float valueZ, const float valueW) { ENSURE(m_ShaderProgram); m_ShaderProgram->SetUniform(bindingSlot, valueX, valueY, valueZ, valueW); } void CDeviceCommandContext::SetUniform( const int32_t bindingSlot, PS::span values) { ENSURE(m_ShaderProgram); m_ShaderProgram->SetUniform(bindingSlot, values); } CDeviceCommandContext::ScopedBind::ScopedBind( CDeviceCommandContext* deviceCommandContext, const GLenum target, const GLuint handle) : m_DeviceCommandContext(deviceCommandContext), m_OldBindUnit(deviceCommandContext->m_BoundTextures[deviceCommandContext->m_ActiveTextureUnit]), m_ActiveTextureUnit(deviceCommandContext->m_ActiveTextureUnit) { const uint32_t unit = m_DeviceCommandContext->m_BoundTextures.size() - 1; m_DeviceCommandContext->BindTexture(unit, target, handle); } CDeviceCommandContext::ScopedBind::~ScopedBind() { m_DeviceCommandContext->BindTexture( m_ActiveTextureUnit, m_OldBindUnit.target, m_OldBindUnit.handle); } CDeviceCommandContext::ScopedBufferBind::ScopedBufferBind( CDeviceCommandContext* deviceCommandContext, CBuffer* buffer) : m_DeviceCommandContext(deviceCommandContext) { ENSURE(buffer); m_CacheIndex = static_cast(buffer->GetType()); const GLenum target = BufferTypeToGLTarget(buffer->GetType()); const GLuint handle = buffer->GetHandle(); if (m_DeviceCommandContext->m_BoundBuffers[m_CacheIndex].first == target && m_DeviceCommandContext->m_BoundBuffers[m_CacheIndex].second == handle) { // Use an invalid index as a sign that we don't need to restore the // bound buffer. 
m_CacheIndex = m_DeviceCommandContext->m_BoundBuffers.size(); } else { glBindBufferARB(target, handle); } } CDeviceCommandContext::ScopedBufferBind::~ScopedBufferBind() { if (m_CacheIndex >= m_DeviceCommandContext->m_BoundBuffers.size()) return; glBindBufferARB( m_DeviceCommandContext->m_BoundBuffers[m_CacheIndex].first, m_DeviceCommandContext->m_BoundBuffers[m_CacheIndex].second); } } // namespace GL } // namespace Backend } // namespace Renderer Index: ps/trunk/source/graphics/ShaderTechnique.cpp =================================================================== --- ps/trunk/source/graphics/ShaderTechnique.cpp (revision 28009) +++ ps/trunk/source/graphics/ShaderTechnique.cpp (revision 28010) @@ -1,71 +1,97 @@ -/* Copyright (C) 2022 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . 
*/ #include "precompiled.h" #include "ShaderTechnique.h" #include "graphics/ShaderProgram.h" #include "renderer/backend/IDevice.h" +#include + CShaderPass::CShaderPass( std::unique_ptr pipelineState, const CShaderProgramPtr& shader) : m_Shader(shader), m_PipelineState(std::move(pipelineState)) { ENSURE(shader); } CShaderTechnique::CShaderTechnique( const VfsPath& path, const CShaderDefines& defines, const PipelineStateDescCallback& callback) : m_Path(path), m_Defines(defines), m_PipelineStateDescCallback(callback) { } void CShaderTechnique::SetPasses(std::vector&& passes) { + ENSURE(!m_ComputePipelineState); m_Passes = std::move(passes); } +void CShaderTechnique::SetComputePipelineState( + std::unique_ptr pipelineState, + const CShaderProgramPtr& computeShader) +{ + ENSURE(m_Passes.empty()); + m_ComputePipelineState = std::move(pipelineState); + m_ComputeShader = computeShader; +} + int CShaderTechnique::GetNumPasses() const { return m_Passes.size(); } Renderer::Backend::IShaderProgram* CShaderTechnique::GetShader(int pass) const { - ENSURE(0 <= pass && pass < (int)m_Passes.size()); - return m_Passes[pass].GetPipelineState()->GetShaderProgram(); + if (m_ComputeShader) + { + ENSURE(pass == 0); + return m_ComputeShader->GetBackendShaderProgram(); + } + else + { + ENSURE(0 <= pass && pass < (int)m_Passes.size()); + return m_Passes[pass].GetPipelineState()->GetShaderProgram(); + } } Renderer::Backend::IGraphicsPipelineState* CShaderTechnique::GetGraphicsPipelineState(int pass) const { ENSURE(0 <= pass && pass < static_cast(m_Passes.size())); return m_Passes[pass].GetPipelineState(); } +Renderer::Backend::IComputePipelineState* +CShaderTechnique::GetComputePipelineState() const +{ + return m_ComputePipelineState.get(); +} + bool CShaderTechnique::GetSortByDistance() const { return m_SortByDistance; } void CShaderTechnique::SetSortByDistance(bool enable) { m_SortByDistance = enable; } Index: ps/trunk/source/ps/CStrInternStatic.h 
=================================================================== --- ps/trunk/source/ps/CStrInternStatic.h (revision 28009) +++ ps/trunk/source/ps/CStrInternStatic.h (revision 28010) @@ -1,196 +1,202 @@ -/* Copyright (C) 2023 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . */ // This file defines global CStrIntern variables, to avoid the cost of // constructing CStrInterns frequently at runtime. // // A line like // X(foo) // defines a variable str_foo with value "foo". // // A line like // X2(foo_0, "foo[0]") // defines a variable str_foo_0 with value "foo[0]". // For direct inclusion, we presumably just want the extern definitions. 
#ifndef X #include "CStrIntern.h" #define X(id) extern CStrIntern str_##id; #define X2(id, str) extern CStrIntern str_##id; #endif X(0) X(1) X(2) X(3) X(4) X(ALPHABLEND_PASS_BLEND) X(ALPHABLEND_PASS_OPAQUE) X(BLEND) X(BLOOM_NOP) X(BLOOM_PASS_H) X(BLOOM_PASS_V) X(DECAL) X(DISABLE_RECEIVE_SHADOWS) X(IGNORE_LOS) X(MINIMAP_BASE) X(MINIMAP_POINT) X(MODE_SHADOWCAST) X(MODE_SILHOUETTEDISPLAY) X(MODE_SILHOUETTEOCCLUDER) X(MODE_WIREFRAME) X(MODE_WIREFRAME_SOLID) X(PASS_REFLECTIONS) X(PASS_REFRACTIONS) X(PASS_SHADOWS) X(RENDER_DEBUG_MODE) X(RENDER_DEBUG_MODE_AO) X(RENDER_DEBUG_MODE_ALPHA) X(RENDER_DEBUG_MODE_CUSTOM) X(RENDER_DEBUG_MODE_NONE) X(SHADOWS_CASCADE_COUNT) X(USE_DESCRIPTOR_INDEXING) X(USE_FANCY_EFFECTS) X(USE_FP_SHADOW) X(USE_GPU_INSTANCING) X(USE_GPU_SKINNING) X(USE_INSTANCING) X(USE_NORMALS) X(USE_OBJECTCOLOR) X(USE_REAL_DEPTH) X(USE_REFLECTION) X(USE_REFRACTION) X(USE_SHADOW) X(USE_SHADOW_PCF) X(USE_SHADOW_SAMPLER) X(USE_FOG) X(WATERTYPE_CLAP) X(WATERTYPE_LAKE) X2(_emptystring, "") X(a_apexPosition) X(a_otherPosition) X(a_retreatPosition) X(a_skinJoints) X(a_skinWeights) X(a_splashPosition) X(a_tangent) X(a_waterInfo) X(ambient) X(baseTex) X(blendTex) X(bloom) X(blurTex2) X(blurTex4) X(blurTex8) X(brightness) X(cameraForward) X(cameraPos) X(canvas2d) X(color) X(colorAdd) X(colorMul) +X(compute_rcas) +X(compute_upscale_fsr) X(debug_line) X(debug_overlay) X(delta) X(depthTex) X(dummy) X(foamTex) X(fogColor) X(fogParams) X(foreground_overlay) X(fxaa) X(grayscaleFactor) X(hdr) X(height) X(instancingTransform) +X(inTex) X(losTex) X(losTex1) X(losTex2) X(losTransform) X(los_interp) X(mapSize) X(maskTex) X(maskTextureTransform) X(minimap) X(minimap_los) X(modelViewMatrix) X(murkiness) X(normalMap) X(normalMap2) X(objectColor) X(overlay_line) X(overlay_solid) +X(outTex) X(particle_add) X(particle_multiply) X(particle_overlay) X(particle_solid) X(particle_subtract) X(playerColor) X(projInvTransform) X(qualityLevel) X(reflectionMap) X(reflectionMatrix) X(refractionMap) 
X(refractionMatrix) X(renderedTex) X(repeatScale) X2(sans_10, "sans-10"); X(saturation) X(screenSize) X(shadingColor) X(shadowDistance) X(shadowDistances) X(shadowScale) X(shadowTex) X(shadowTransform) X(shadowTransforms) X(sharpness) X(skinBlendMatrices) X(skyBoxRot) X(skyCube) X(sky_simple) X(solid) X(sunColor) X(sunDir) X(terrain_base) X(terrain_blend) X(terrain_decal) X(terrain_solid) X(tex) X(texSize) X(textureTransform) X(time) X(tint) X(transform) X(translation) +X(upscale_bilinear) +X(upscale_nearest) X(viewInvTransform) X(water_high) X(water_simple) X(water_waves) X(waterEffectsTex) X(waterTex) X(waveTex) X(waviness) X(waveParams1) X(waveParams2) X(width) X(windAngle) X(zFar) X(zNear) #undef X #undef X2 Index: ps/trunk/source/renderer/PostprocManager.h =================================================================== --- ps/trunk/source/renderer/PostprocManager.h (revision 28009) +++ ps/trunk/source/renderer/PostprocManager.h (revision 28010) @@ -1,189 +1,231 @@ -/* Copyright (C) 2023 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . 
*/ #ifndef INCLUDED_POSTPROCMANAGER #define INCLUDED_POSTPROCMANAGER #include "graphics/ShaderTechniquePtr.h" #include "ps/CStr.h" #include "renderer/backend/IFramebuffer.h" #include "renderer/backend/IDeviceCommandContext.h" #include "renderer/backend/IShaderProgram.h" #include "renderer/backend/ITexture.h" #include #include class CPostprocManager { public: CPostprocManager(Renderer::Backend::IDevice* device); ~CPostprocManager(); // Returns true if the manager can be used. bool IsEnabled() const; // Create all buffers/textures in GPU memory and set default effect. // @note Must be called before using in the renderer. May be called multiple times. void Initialize(); // Update the size of the screen void Resize(); // Returns a list of xml files found in shaders/effects/postproc. static std::vector GetPostEffects(); // Returns the name of the current effect. const CStrW& GetPostEffect() const { return m_PostProcEffect; } // Sets the current effect. void SetPostEffect(const CStrW& name); - // Triggers update of shaders and FBO if needed. + // Triggers update of shaders and framebuffers if needed. void UpdateAntiAliasingTechnique(); void UpdateSharpeningTechnique(); void UpdateSharpnessFactor(); + void SetUpscaleTechnique(const CStr& upscaleName); void SetDepthBufferClipPlanes(float nearPlane, float farPlane); // @note CPostprocManager must be initialized first Renderer::Backend::IFramebuffer* PrepareAndGetOutputFramebuffer(); // First renders blur textures, then calls ApplyEffect for each effect pass, // ping-ponging the buffers at each step. // @note CPostprocManager must be initialized first void ApplyPostproc( Renderer::Backend::IDeviceCommandContext* deviceCommandContext); // Blits the final postprocessed texture to the system framebuffer. The system // framebuffer is selected as the output buffer. Should be called before // silhouette rendering. 
// @note CPostprocManager must be initialized first void BlitOutputFramebuffer( Renderer::Backend::IDeviceCommandContext* deviceCommandContext, Renderer::Backend::IFramebuffer* destination); // Returns true if we render main scene in the MSAA framebuffer. bool IsMultisampleEnabled() const; // Resolves the MSAA framebuffer into the regular one. void ResolveMultisampleFramebuffer( Renderer::Backend::IDeviceCommandContext* deviceCommandContext); private: void CreateMultisampleBuffer(); void DestroyMultisampleBuffer(); + void RecalculateSize(const uint32_t width, const uint32_t height); + + bool ShouldUpscale() const; + bool ShouldDownscale() const; + + void UpscaleTextureByCompute( + Renderer::Backend::IDeviceCommandContext* deviceCommandContext, + CShaderTechnique* shaderTechnique, + Renderer::Backend::ITexture* source, + Renderer::Backend::ITexture* destination); + void UpscaleTextureByFullscreenQuad( + Renderer::Backend::IDeviceCommandContext* deviceCommandContext, + CShaderTechnique* shaderTechnique, + Renderer::Backend::ITexture* source, + Renderer::Backend::IFramebuffer* destination); + + void ApplySharpnessAfterScale( + Renderer::Backend::IDeviceCommandContext* deviceCommandContext, + CShaderTechnique* shaderTechnique, + Renderer::Backend::ITexture* source, + Renderer::Backend::ITexture* destination); + + void DownscaleTextureByCompute( + Renderer::Backend::IDeviceCommandContext* deviceCommandContext, + CShaderTechnique* shaderTechnique, + Renderer::Backend::ITexture* source, + Renderer::Backend::ITexture* destination); + Renderer::Backend::IDevice* m_Device = nullptr; std::unique_ptr m_CaptureFramebuffer; // Two framebuffers, that we flip between at each shader pass. std::unique_ptr m_PingFramebuffer, m_PongFramebuffer; // Unique color textures for the framebuffers. 
std::unique_ptr m_ColorTex1, m_ColorTex2; + std::unique_ptr + m_UnscaledTexture1, m_UnscaledTexture2; + std::unique_ptr + m_UnscaledFramebuffer1, m_UnscaledFramebuffer2; + float m_Scale = 1.0f; + // The framebuffers share a depth/stencil texture. std::unique_ptr m_DepthTex; float m_NearPlane, m_FarPlane; // A framebuffer and textures x2 for each blur level we render. struct BlurScale { struct Step { std::unique_ptr framebuffer; std::unique_ptr texture; }; std::array steps; }; std::array m_BlurScales; // Indicates which of the ping-pong buffers is used for reading and which for drawing. bool m_WhichBuffer; Renderer::Backend::IVertexInputLayout* m_VertexInputLayout = nullptr; // The name and shader technique we are using. "default" name means no technique is used // (i.e. while we do allocate the buffers, no effects are rendered). CStrW m_PostProcEffect; CShaderTechniquePtr m_PostProcTech; CStr m_SharpName; CShaderTechniquePtr m_SharpTech; float m_Sharpness; + CShaderTechniquePtr m_UpscaleTech; + CShaderTechniquePtr m_UpscaleComputeTech; + CShaderTechniquePtr m_DownscaleComputeTech; + // Sharp technique only for FSR upscale. + CShaderTechniquePtr m_RCASComputeTech; + CStr m_AAName; CShaderTechniquePtr m_AATech; bool m_UsingMultisampleBuffer; std::unique_ptr m_MultisampleFramebuffer; std::unique_ptr m_MultisampleColorTex, m_MultisampleDepthTex; uint32_t m_MultisampleCount; std::vector m_AllowedSampleCounts; // The current screen dimensions in pixels. - int m_Width, m_Height; + uint32_t m_Width, m_Height; + uint32_t m_UnscaledWidth, m_UnscaledHeight; // Is the postproc manager initialized? Buffers created? Default effect loaded? bool m_IsInitialized; // Creates blur textures at various scales, for bloom, DOF, etc. void ApplyBlur( Renderer::Backend::IDeviceCommandContext* deviceCommandContext); // High quality GPU image scaling to half size. outTex must have exactly half the size // of inTex. inWidth and inHeight are the dimensions of inTex in texels. 
void ApplyBlurDownscale2x( Renderer::Backend::IDeviceCommandContext* deviceCommandContext, Renderer::Backend::IFramebuffer* framebuffer, Renderer::Backend::ITexture* inTex, int inWidth, int inHeight); // GPU-based Gaussian blur in two passes. inOutTex contains the input image and will be filled // with the blurred image. tempTex must have the same size as inOutTex. // inWidth and inHeight are the dimensions of the images in texels. void ApplyBlurGauss( Renderer::Backend::IDeviceCommandContext* deviceCommandContext, Renderer::Backend::ITexture* inTex, Renderer::Backend::ITexture* tempTex, Renderer::Backend::IFramebuffer* tempFramebuffer, Renderer::Backend::IFramebuffer* outFramebuffer, int inWidth, int inHeight); // Applies a pass of a given effect to the entire current framebuffer. The shader is // provided with a number of general-purpose variables, including the rendered screen so far, // the depth buffer, a number of blur textures, the screen size, the zNear/zFar planes and // some other parameters used by the optional bloom/HDR pass. void ApplyEffect( Renderer::Backend::IDeviceCommandContext* deviceCommandContext, const CShaderTechniquePtr& shaderTech, int pass); // Delete all allocated buffers/textures from GPU memory. void Cleanup(); // Delete existing buffers/textures and create them again, using a new screen size if needed. // (the textures are also attached to the framebuffers) void RecreateBuffers(); }; #endif // INCLUDED_POSTPROCMANAGER Index: ps/trunk/source/renderer/RenderingOptions.cpp =================================================================== --- ps/trunk/source/renderer/RenderingOptions.cpp (revision 28009) +++ ps/trunk/source/renderer/RenderingOptions.cpp (revision 28010) @@ -1,304 +1,318 @@ -/* Copyright (C) 2023 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. 
is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . */ #include "precompiled.h" #include "RenderingOptions.h" #include "graphics/TextureManager.h" #include "ps/CLogger.h" #include "ps/ConfigDB.h" #include "ps/CStr.h" #include "ps/CStrInternStatic.h" #include "ps/VideoMode.h" #include "renderer/backend/IDevice.h" #include "renderer/Renderer.h" #include "renderer/PostprocManager.h" #include "renderer/SceneRenderer.h" #include "renderer/ShadowMap.h" #include "renderer/WaterManager.h" CRenderingOptions g_RenderingOptions; class CRenderingOptions::ConfigHooks { public: std::vector::iterator begin() { return hooks.begin(); } std::vector::iterator end() { return hooks.end(); } template void Setup(CStr8 name, T& variable) { hooks.emplace_back(g_ConfigDB.RegisterHookAndCall(name, [name, &variable]() { CFG_GET_VAL(name, variable); })); } void Setup(CStr8 name, std::function hook) { hooks.emplace_back(g_ConfigDB.RegisterHookAndCall(name, hook)); } void clear() { hooks.clear(); } private: std::vector hooks; }; RenderPath RenderPathEnum::FromString(const CStr8& name) { if (name == "default") return DEFAULT; if (name == "fixed") return FIXED; if (name == "shader") return SHADER; LOGWARNING("Unknown render path %s", name.c_str()); return DEFAULT; } CStr8 RenderPathEnum::ToString(RenderPath path) { switch (path) { case RenderPath::DEFAULT: return "default"; case RenderPath::FIXED: return "fixed"; case RenderPath::SHADER: return "shader"; } return "default"; // Silence 
warning about reaching end of non-void function. } RenderDebugMode RenderDebugModeEnum::FromString(const CStr8& name) { if (name == str_RENDER_DEBUG_MODE_NONE.c_str()) return RenderDebugMode::NONE; if (name == str_RENDER_DEBUG_MODE_AO.c_str()) return RenderDebugMode::AO; if (name == str_RENDER_DEBUG_MODE_ALPHA.c_str()) return RenderDebugMode::ALPHA; if (name == str_RENDER_DEBUG_MODE_CUSTOM.c_str()) return RenderDebugMode::CUSTOM; LOGWARNING("Unknown render debug mode %s", name.c_str()); return RenderDebugMode::NONE; } CStrIntern RenderDebugModeEnum::ToString(RenderDebugMode mode) { switch (mode) { case RenderDebugMode::AO: return str_RENDER_DEBUG_MODE_AO; case RenderDebugMode::ALPHA: return str_RENDER_DEBUG_MODE_ALPHA; case RenderDebugMode::CUSTOM: return str_RENDER_DEBUG_MODE_CUSTOM; default: break; } return str_RENDER_DEBUG_MODE_NONE; } CRenderingOptions::CRenderingOptions() : m_ConfigHooks(new ConfigHooks()) { m_RenderPath = RenderPath::DEFAULT; m_Shadows = false; m_WaterEffects = false; m_WaterFancyEffects = false; m_WaterRealDepth = false; m_WaterRefraction = false; m_WaterReflection = false; m_ShadowAlphaFix = false; m_ShadowPCF = false; m_Particles = false; m_Silhouettes = false; m_Fog = false; m_GPUSkinning = false; m_SmoothLOS = false; m_PostProc = false; m_DisplayFrustum = false; m_DisplayShadowsFrustum = false; m_RenderActors = true; } CRenderingOptions::~CRenderingOptions() { ClearHooks(); } void CRenderingOptions::ReadConfigAndSetupHooks() { m_ConfigHooks->Setup("renderpath", [this]() { CStr renderPath; CFG_GET_VAL("renderpath", renderPath); SetRenderPath(RenderPathEnum::FromString(renderPath)); }); m_ConfigHooks->Setup("shadowquality", []() { if (CRenderer::IsInitialised()) g_Renderer.GetSceneRenderer().GetShadowMap().RecreateTexture(); }); m_ConfigHooks->Setup("shadowscascadecount", []() { if (CRenderer::IsInitialised()) { g_Renderer.GetSceneRenderer().GetShadowMap().RecreateTexture(); g_Renderer.MakeShadersDirty(); } }); 
m_ConfigHooks->Setup("shadowscovermap", []() { if (CRenderer::IsInitialised()) { g_Renderer.GetSceneRenderer().GetShadowMap().RecreateTexture(); g_Renderer.MakeShadersDirty(); } }); m_ConfigHooks->Setup("shadowscutoffdistance", []() { if (CRenderer::IsInitialised()) g_Renderer.GetSceneRenderer().GetShadowMap().RecreateTexture(); }); m_ConfigHooks->Setup("shadows", [this]() { bool enabled; CFG_GET_VAL("shadows", enabled); SetShadows(enabled); }); m_ConfigHooks->Setup("shadowpcf", [this]() { bool enabled; CFG_GET_VAL("shadowpcf", enabled); SetShadowPCF(enabled); }); m_ConfigHooks->Setup("postproc", m_PostProc); m_ConfigHooks->Setup("antialiasing", []() { if (CRenderer::IsInitialised()) g_Renderer.GetPostprocManager().UpdateAntiAliasingTechnique(); }); m_ConfigHooks->Setup("sharpness", []() { if (CRenderer::IsInitialised()) g_Renderer.GetPostprocManager().UpdateSharpnessFactor(); }); m_ConfigHooks->Setup("sharpening", []() { if (CRenderer::IsInitialised()) g_Renderer.GetPostprocManager().UpdateSharpeningTechnique(); }); + m_ConfigHooks->Setup("renderer.scale", []() + { + if (CRenderer::IsInitialised()) + g_Renderer.GetPostprocManager().Resize(); + }); + + m_ConfigHooks->Setup("renderer.upscale.technique", []() + { + CStr upscaleName; + CFG_GET_VAL("renderer.upscale.technique", upscaleName); + if (CRenderer::IsInitialised()) + g_Renderer.GetPostprocManager().SetUpscaleTechnique(upscaleName); + }); + m_ConfigHooks->Setup("smoothlos", m_SmoothLOS); m_ConfigHooks->Setup("watereffects", [this]() { bool enabled; CFG_GET_VAL("watereffects", enabled); SetWaterEffects(enabled); if (CRenderer::IsInitialised()) g_Renderer.GetSceneRenderer().GetWaterManager().RecreateOrLoadTexturesIfNeeded(); }); m_ConfigHooks->Setup("waterfancyeffects", [this]() { bool enabled; CFG_GET_VAL("waterfancyeffects", enabled); SetWaterFancyEffects(enabled); if (CRenderer::IsInitialised()) g_Renderer.GetSceneRenderer().GetWaterManager().RecreateOrLoadTexturesIfNeeded(); }); 
m_ConfigHooks->Setup("waterrealdepth", m_WaterRealDepth); m_ConfigHooks->Setup("waterrefraction", [this]() { bool enabled; CFG_GET_VAL("waterrefraction", enabled); SetWaterRefraction(enabled); if (CRenderer::IsInitialised()) g_Renderer.GetSceneRenderer().GetWaterManager().RecreateOrLoadTexturesIfNeeded(); }); m_ConfigHooks->Setup("waterreflection", [this]() { bool enabled; CFG_GET_VAL("waterreflection", enabled); SetWaterReflection(enabled); if (CRenderer::IsInitialised()) g_Renderer.GetSceneRenderer().GetWaterManager().RecreateOrLoadTexturesIfNeeded(); }); m_ConfigHooks->Setup("particles", m_Particles); m_ConfigHooks->Setup("fog", [this]() { bool enabled; CFG_GET_VAL("fog", enabled); SetFog(enabled); }); m_ConfigHooks->Setup("silhouettes", m_Silhouettes); m_ConfigHooks->Setup("gpuskinning", [this]() { bool enabled; CFG_GET_VAL("gpuskinning", enabled); if (enabled) { if (g_VideoMode.GetBackendDevice()->GetBackend() == Renderer::Backend::Backend::GL_ARB) LOGWARNING("GPUSkinning has been disabled, because it is not supported with ARB shaders."); else if (g_VideoMode.GetBackendDevice()->GetBackend() == Renderer::Backend::Backend::VULKAN) LOGWARNING("GPUSkinning has been disabled, because it is not supported for Vulkan backend yet."); else m_GPUSkinning = true; } }); m_ConfigHooks->Setup("renderactors", m_RenderActors); m_ConfigHooks->Setup("textures.quality", []() { if (CRenderer::IsInitialised()) g_Renderer.GetTextureManager().OnQualityChanged(); }); m_ConfigHooks->Setup("textures.maxanisotropy", []() { if (CRenderer::IsInitialised()) g_Renderer.GetTextureManager().OnQualityChanged(); }); } void CRenderingOptions::ClearHooks() { m_ConfigHooks->clear(); } void CRenderingOptions::SetShadows(bool value) { m_Shadows = value; if (CRenderer::IsInitialised()) g_Renderer.MakeShadersDirty(); } void CRenderingOptions::SetShadowPCF(bool value) { m_ShadowPCF = value; if (CRenderer::IsInitialised()) g_Renderer.MakeShadersDirty(); } void CRenderingOptions::SetFog(bool value) { 
m_Fog = value; if (CRenderer::IsInitialised()) g_Renderer.MakeShadersDirty(); } void CRenderingOptions::SetRenderPath(RenderPath value) { m_RenderPath = value; if (CRenderer::IsInitialised()) g_Renderer.SetRenderPath(m_RenderPath); } void CRenderingOptions::SetRenderDebugMode(RenderDebugMode value) { m_RenderDebugMode = value; if (CRenderer::IsInitialised()) g_Renderer.MakeShadersDirty(); } Index: ps/trunk/source/renderer/backend/ITexture.h =================================================================== --- ps/trunk/source/renderer/backend/ITexture.h (revision 28009) +++ ps/trunk/source/renderer/backend/ITexture.h (revision 28010) @@ -1,67 +1,68 @@ -/* Copyright (C) 2022 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . */ #ifndef INCLUDED_RENDERER_BACKEND_ITEXTURE #define INCLUDED_RENDERER_BACKEND_ITEXTURE #include "renderer/backend/Format.h" #include "renderer/backend/IDeviceObject.h" #include "renderer/backend/Sampler.h" #include namespace Renderer { namespace Backend { class ITexture : public IDeviceObject { public: enum class Type { TEXTURE_2D, TEXTURE_2D_MULTISAMPLE, TEXTURE_CUBE }; // Using a struct instead of a enum allows using the same syntax while // avoiding adding operator overrides and additional checks on casts. 
struct Usage { static constexpr uint32_t TRANSFER_SRC = 1u << 0u; static constexpr uint32_t TRANSFER_DST = 1u << 1u; static constexpr uint32_t SAMPLED = 1u << 2u; static constexpr uint32_t COLOR_ATTACHMENT = 1u << 3u; static constexpr uint32_t DEPTH_STENCIL_ATTACHMENT = 1u << 4u; + static constexpr uint32_t STORAGE = 1u << 5u; }; virtual Type GetType() const = 0; virtual uint32_t GetUsage() const = 0; virtual Format GetFormat() const = 0; virtual uint32_t GetWidth() const = 0; virtual uint32_t GetHeight() const = 0; virtual uint32_t GetMIPLevelCount() const = 0; }; } // namespace Backend } // namespace Renderer #endif // INCLUDED_RENDERER_BACKEND_ITEXTURE Index: ps/trunk/source/renderer/backend/dummy/Device.h =================================================================== --- ps/trunk/source/renderer/backend/dummy/Device.h (revision 28009) +++ ps/trunk/source/renderer/backend/dummy/Device.h (revision 28010) @@ -1,121 +1,124 @@ -/* Copyright (C) 2023 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . 
*/ #ifndef INCLUDED_RENDERER_BACKEND_DUMMY_DEVICE #define INCLUDED_RENDERER_BACKEND_DUMMY_DEVICE #include "renderer/backend/dummy/DeviceForward.h" #include "renderer/backend/IDevice.h" #include #include #include class CShaderDefines; namespace Renderer { namespace Backend { namespace Dummy { class CDeviceCommandContext; class CDevice : public IDevice { public: CDevice(); ~CDevice() override; Backend GetBackend() const override { return Backend::DUMMY; } const std::string& GetName() const override { return m_Name; } const std::string& GetVersion() const override { return m_Version; } const std::string& GetDriverInformation() const override { return m_DriverInformation; } const std::vector& GetExtensions() const override { return m_Extensions; } void Report(const ScriptRequest& rq, JS::HandleValue settings) override; std::unique_ptr CreateCommandContext() override; std::unique_ptr CreateGraphicsPipelineState( const SGraphicsPipelineStateDesc& pipelineStateDesc) override; + std::unique_ptr CreateComputePipelineState( + const SComputePipelineStateDesc& pipelineStateDesc) override; + std::unique_ptr CreateVertexInputLayout( const PS::span attributes) override; std::unique_ptr CreateTexture( const char* name, const ITexture::Type type, const uint32_t usage, const Format format, const uint32_t width, const uint32_t height, const Sampler::Desc& defaultSamplerDesc, const uint32_t MIPLevelCount, const uint32_t sampleCount) override; std::unique_ptr CreateTexture2D( const char* name, const uint32_t usage, const Format format, const uint32_t width, const uint32_t height, const Sampler::Desc& defaultSamplerDesc, const uint32_t MIPLevelCount = 1, const uint32_t sampleCount = 1) override; std::unique_ptr CreateFramebuffer( const char* name, SColorAttachment* colorAttachment, SDepthStencilAttachment* depthStencilAttachment) override; std::unique_ptr CreateBuffer( const char* name, const IBuffer::Type type, const uint32_t size, const bool dynamic) override; std::unique_ptr 
CreateShaderProgram( const CStr& name, const CShaderDefines& defines) override; bool AcquireNextBackbuffer() override; IFramebuffer* GetCurrentBackbuffer( const AttachmentLoadOp, const AttachmentStoreOp, const AttachmentLoadOp, const AttachmentStoreOp) override; void Present() override; void OnWindowResize(const uint32_t width, const uint32_t height) override; bool IsTextureFormatSupported(const Format format) const override; bool IsFramebufferFormatSupported(const Format format) const override; Format GetPreferredDepthStencilFormat( const uint32_t usage, const bool depth, const bool stencil) const override; const Capabilities& GetCapabilities() const override { return m_Capabilities; } protected: std::string m_Name; std::string m_Version; std::string m_DriverInformation; std::vector m_Extensions; std::unique_ptr m_Backbuffer; Capabilities m_Capabilities{}; }; } // namespace Dummy } // namespace Backend } // namespace Renderer #endif // INCLUDED_RENDERER_BACKEND_DUMMY_DEVICE Index: ps/trunk/source/renderer/backend/dummy/PipelineState.cpp =================================================================== --- ps/trunk/source/renderer/backend/dummy/PipelineState.cpp (revision 28009) +++ ps/trunk/source/renderer/backend/dummy/PipelineState.cpp (revision 28010) @@ -1,52 +1,67 @@ -/* Copyright (C) 2022 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . 
*/ #include "precompiled.h" #include "PipelineState.h" #include "renderer/backend/dummy/Device.h" namespace Renderer { namespace Backend { namespace Dummy { // static std::unique_ptr CGraphicsPipelineState::Create( CDevice* device, const SGraphicsPipelineStateDesc& desc) { std::unique_ptr pipelineState{new CGraphicsPipelineState()}; pipelineState->m_Device = device; pipelineState->m_Desc = desc; return pipelineState; } IDevice* CGraphicsPipelineState::GetDevice() { return m_Device; } +// static +std::unique_ptr CComputePipelineState::Create( + CDevice* device, const SComputePipelineStateDesc& desc) +{ + std::unique_ptr pipelineState{new CComputePipelineState()}; + pipelineState->m_Device = device; + pipelineState->m_Desc = desc; + return pipelineState; +} + +IDevice* CComputePipelineState::GetDevice() +{ + return m_Device; +} + } // namespace Dummy } // namespace Backend } // namespace Renderer Index: ps/trunk/source/renderer/backend/gl/Device.h =================================================================== --- ps/trunk/source/renderer/backend/gl/Device.h (revision 28009) +++ ps/trunk/source/renderer/backend/gl/Device.h (revision 28010) @@ -1,165 +1,168 @@ -/* Copyright (C) 2023 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . 
*/ #ifndef INCLUDED_RENDERER_BACKEND_GL_DEVICE #define INCLUDED_RENDERER_BACKEND_GL_DEVICE #include "renderer/backend/Format.h" #include "renderer/backend/gl/Buffer.h" #include "renderer/backend/gl/DeviceForward.h" #include "renderer/backend/gl/Framebuffer.h" #include "renderer/backend/gl/ShaderProgram.h" #include "renderer/backend/gl/Texture.h" #include "renderer/backend/IDevice.h" #include "scriptinterface/ScriptForward.h" #include #include #include #include #include typedef struct SDL_Window SDL_Window; typedef void* SDL_GLContext; namespace Renderer { namespace Backend { namespace GL { class CDeviceCommandContext; class CDevice final : public IDevice { public: ~CDevice() override; /** * Creates the GL device and the GL context for the window if it presents. */ static std::unique_ptr Create(SDL_Window* window, const bool arb); Backend GetBackend() const override { return m_ARB ? Backend::GL_ARB : Backend::GL; } const std::string& GetName() const override { return m_Name; } const std::string& GetVersion() const override { return m_Version; } const std::string& GetDriverInformation() const override { return m_DriverInformation; } const std::vector& GetExtensions() const override { return m_Extensions; } void Report(const ScriptRequest& rq, JS::HandleValue settings) override; std::unique_ptr CreateCommandContext() override; std::unique_ptr CreateGraphicsPipelineState( const SGraphicsPipelineStateDesc& pipelineStateDesc) override; + std::unique_ptr CreateComputePipelineState( + const SComputePipelineStateDesc& pipelineStateDesc) override; + std::unique_ptr CreateVertexInputLayout( const PS::span attributes) override; CDeviceCommandContext* GetActiveCommandContext() { return m_ActiveCommandContext; } std::unique_ptr CreateTexture( const char* name, const ITexture::Type type, const uint32_t usage, const Format format, const uint32_t width, const uint32_t height, const Sampler::Desc& defaultSamplerDesc, const uint32_t MIPLevelCount, const uint32_t sampleCount) 
override; std::unique_ptr CreateTexture2D( const char* name, const uint32_t usage, const Format format, const uint32_t width, const uint32_t height, const Sampler::Desc& defaultSamplerDesc, const uint32_t MIPLevelCount = 1, const uint32_t sampleCount = 1) override; std::unique_ptr CreateFramebuffer( const char* name, SColorAttachment* colorAttachment, SDepthStencilAttachment* depthStencilAttachment) override; std::unique_ptr CreateBuffer( const char* name, const IBuffer::Type type, const uint32_t size, const bool dynamic) override; std::unique_ptr CreateShaderProgram( const CStr& name, const CShaderDefines& defines) override; bool AcquireNextBackbuffer() override; IFramebuffer* GetCurrentBackbuffer( const AttachmentLoadOp colorAttachmentLoadOp, const AttachmentStoreOp colorAttachmentStoreOp, const AttachmentLoadOp depthStencilAttachmentLoadOp, const AttachmentStoreOp depthStencilAttachmentStoreOp) override; void Present() override; void OnWindowResize(const uint32_t width, const uint32_t height) override; bool UseFramebufferInvalidating() const { return m_UseFramebufferInvalidating; } bool IsTextureFormatSupported(const Format format) const override; bool IsFramebufferFormatSupported(const Format format) const override; Format GetPreferredDepthStencilFormat( const uint32_t usage, const bool depth, const bool stencil) const override; const Capabilities& GetCapabilities() const override { return m_Capabilities; } private: CDevice(); SDL_Window* m_Window = nullptr; SDL_GLContext m_Context = nullptr; int m_SurfaceDrawableWidth = 0, m_SurfaceDrawableHeight = 0; bool m_ARB = false; std::string m_Name; std::string m_Version; std::string m_DriverInformation; std::vector m_Extensions; // GL can have the only one command context at once. // TODO: remove as soon as we have no GL code outside backend, currently // it's used only as a helper for transition. 
CDeviceCommandContext* m_ActiveCommandContext = nullptr; using BackbufferKey = std::tuple< AttachmentLoadOp, AttachmentStoreOp, AttachmentLoadOp, AttachmentStoreOp>; struct BackbufferKeyHash { size_t operator()(const BackbufferKey& key) const; }; // We use std::unordered_map to avoid storing sizes of Attachment*Op // enumerations. If it becomes a performance issue we'll replace it // by an array. std::unordered_map< BackbufferKey, std::unique_ptr, BackbufferKeyHash> m_Backbuffers; bool m_BackbufferAcquired = false; bool m_UseFramebufferInvalidating = false; Capabilities m_Capabilities{}; }; } // namespace GL } // namespace Backend } // namespace Renderer #endif // INCLUDED_RENDERER_BACKEND_GL_DEVICE Index: ps/trunk/binaries/data/mods/mod/shaders/glsl/ffx_a.h =================================================================== --- ps/trunk/binaries/data/mods/mod/shaders/glsl/ffx_a.h (nonexistent) +++ ps/trunk/binaries/data/mods/mod/shaders/glsl/ffx_a.h (revision 28010) @@ -0,0 +1,2656 @@ +//============================================================================================================================== +// +// [A] SHADER PORTABILITY 1.20210629 +// +//============================================================================================================================== +// FidelityFX Super Resolution Sample +// +// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. 
+// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +//------------------------------------------------------------------------------------------------------------------------------ +// MIT LICENSE +// =========== +// Copyright (c) 2014 Michal Drobot (for concepts used in "FLOAT APPROXIMATIONS"). +// ----------- +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, +// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// ----------- +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +// Software. 
+// ----------- +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +//------------------------------------------------------------------------------------------------------------------------------ +// ABOUT +// ===== +// Common central point for high-level shading language and C portability for various shader headers. +//------------------------------------------------------------------------------------------------------------------------------ +// DEFINES +// ======= +// A_CPU ..... Include the CPU related code. +// A_GPU ..... Include the GPU related code. +// A_GLSL .... Using GLSL. +// A_HLSL .... Using HLSL. +// A_HLSL_6_2 Using HLSL 6.2 with new 'uint16_t' and related types (requires '-enable-16bit-types'). +// A_NO_16_BIT_CAST Don't use instructions that are not availabe in SPIR-V (needed for running A_HLSL_6_2 on Vulkan) +// A_GCC ..... Using a GCC compatible compiler (else assume MSVC compatible compiler by default). +// ======= +// A_BYTE .... Support 8-bit integer. +// A_HALF .... Support 16-bit integer and floating point. +// A_LONG .... Support 64-bit integer. +// A_DUBL .... Support 64-bit floating point. +// ======= +// A_WAVE .... Support wave-wide operations. +//------------------------------------------------------------------------------------------------------------------------------ +// To get #include "ffx_a.h" working in GLSL use '#extension GL_GOOGLE_include_directive:require'. 
+//------------------------------------------------------------------------------------------------------------------------------ +// SIMPLIFIED TYPE SYSTEM +// ====================== +// - All ints will be unsigned with exception of when signed is required. +// - Type naming simplified and shortened "A<#components>", +// - H = 16-bit float (half) +// - F = 32-bit float (float) +// - D = 64-bit float (double) +// - P = 1-bit integer (predicate, not using bool because 'B' is used for byte) +// - B = 8-bit integer (byte) +// - W = 16-bit integer (word) +// - U = 32-bit integer (unsigned) +// - L = 64-bit integer (long) +// - Using "AS<#components>" for signed when required. +//------------------------------------------------------------------------------------------------------------------------------ +// TODO +// ==== +// - Make sure 'ALerp*(a,b,m)' does 'b*m+(-a*m+a)' (2 ops). +//------------------------------------------------------------------------------------------------------------------------------ +// CHANGE LOG +// ========== +// 20200914 - Expanded wave ops and prx code. +// 20200713 - Added [ZOL] section, fixed serious bugs in sRGB and Rec.709 color conversion code, etc. 
+//==============================================================================================================================
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// COMMON
+//==============================================================================================================================
+// Constant shared by CPU and GPU code paths: 2*pi (tau), truncated to the precision carried here.
+#define A_2PI 6.28318530718
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//
+// CPU
+//
+//
+//==============================================================================================================================
+#ifdef A_CPU
+ // Supporting user defined overrides.
+ // A_RESTRICT: no-alias pointer qualifier ('__restrict' is accepted by MSVC, GCC and Clang alike).
+ #ifndef A_RESTRICT
+  #define A_RESTRICT __restrict
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ // A_STATIC: internal linkage for every helper, so the header can be included by multiple translation units.
+ #ifndef A_STATIC
+  #define A_STATIC static
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ // Same types across CPU and GPU.
+ // Predicate uses 32-bit integer (C friendly bool).
+ // Naming per the "SIMPLIFIED TYPE SYSTEM" notes above: F/D = float widths, P/B/W/U/L = integer
+ // widths (all stored unsigned), and the AS* typedefs are the signed views used only where
+ // signedness matters.
+ typedef uint32_t AP1;
+ typedef float AF1;
+ typedef double AD1;
+ typedef uint8_t AB1;
+ typedef uint16_t AW1;
+ typedef uint32_t AU1;
+ typedef uint64_t AL1;
+ typedef int8_t ASB1;
+ typedef int16_t ASW1;
+ typedef int32_t ASU1;
+ typedef int64_t ASL1;
+//------------------------------------------------------------------------------------------------------------------------------
+ // Typed-literal/cast helpers, e.g. AU1_(4) is 4 as an AU1.
+ #define AD1_(a) ((AD1)(a))
+ #define AF1_(a) ((AF1)(a))
+ #define AL1_(a) ((AL1)(a))
+ #define AU1_(a) ((AU1)(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ASL1_(a) ((ASL1)(a))
+ #define ASU1_(a) ((ASU1)(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ // Bit-cast of a 32-bit float to its raw bit pattern via union punning.
+ A_STATIC AU1 AU1_AF1(AF1 a){union{AF1 f;AU1 u;}bits;bits.f=a;return bits.u;}
+//------------------------------------------------------------------------------------------------------------------------------
+ #define A_TRUE 1
+ #define A_FALSE 0
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+// CPU/GPU PORTING
+//
+//------------------------------------------------------------------------------------------------------------------------------
+// Get CPU and GPU to share all setup code, without duplicate code paths.
+// This uses a lower-case prefix for special vector constructs.
+// - In C restrict pointers are used.
+// - In the shading language, in/inout/out arguments are used.
+// This depends on the ability to access a vector value in both languages via array syntax (aka color[2]).
+//==============================================================================================================================
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY
+//==============================================================================================================================
+ // On the CPU, "vectors" are plain element arrays, so every argument qualifier lowers to the same
+ // restrict pointer: ret* = returned destination, in* = input, inout* = read/write, out* = output.
+ // The distinct names are kept so shading-language builds can map them to real in/inout/out
+ // qualifiers (see "CPU/GPU PORTING" above).
+ #define retAD2 AD1 *A_RESTRICT
+ #define retAD3 AD1 *A_RESTRICT
+ #define retAD4 AD1 *A_RESTRICT
+ #define retAF2 AF1 *A_RESTRICT
+ #define retAF3 AF1 *A_RESTRICT
+ #define retAF4 AF1 *A_RESTRICT
+ #define retAL2 AL1 *A_RESTRICT
+ #define retAL3 AL1 *A_RESTRICT
+ #define retAL4 AL1 *A_RESTRICT
+ #define retAU2 AU1 *A_RESTRICT
+ #define retAU3 AU1 *A_RESTRICT
+ #define retAU4 AU1 *A_RESTRICT
+//------------------------------------------------------------------------------------------------------------------------------
+ #define inAD2 AD1 *A_RESTRICT
+ #define inAD3 AD1 *A_RESTRICT
+ #define inAD4 AD1 *A_RESTRICT
+ #define inAF2 AF1 *A_RESTRICT
+ #define inAF3 AF1 *A_RESTRICT
+ #define inAF4 AF1 *A_RESTRICT
+ #define inAL2 AL1 *A_RESTRICT
+ #define inAL3 AL1 *A_RESTRICT
+ #define inAL4 AL1 *A_RESTRICT
+ #define inAU2 AU1 *A_RESTRICT
+ #define inAU3 AU1 *A_RESTRICT
+ #define inAU4 AU1 *A_RESTRICT
+//------------------------------------------------------------------------------------------------------------------------------
+ #define inoutAD2 AD1 *A_RESTRICT
+ #define inoutAD3 AD1 *A_RESTRICT
+ #define inoutAD4 AD1 *A_RESTRICT
+ #define inoutAF2 AF1 *A_RESTRICT
+ #define inoutAF3 AF1 *A_RESTRICT
+ #define inoutAF4 AF1 *A_RESTRICT
+ #define inoutAL2 AL1 *A_RESTRICT
+ #define inoutAL3 AL1 *A_RESTRICT
+ #define inoutAL4 AL1 *A_RESTRICT
+ #define inoutAU2 AU1 *A_RESTRICT
+ #define inoutAU3 AU1 *A_RESTRICT
+ #define inoutAU4 AU1 *A_RESTRICT
+//------------------------------------------------------------------------------------------------------------------------------
+ #define outAD2 AD1 *A_RESTRICT
+ #define outAD3 AD1 *A_RESTRICT
+ #define outAD4 AD1 *A_RESTRICT
+ #define outAF2 AF1 *A_RESTRICT
+ #define outAF3 AF1 *A_RESTRICT
+ #define outAF4 AF1 *A_RESTRICT
+ #define outAL2 AL1 *A_RESTRICT
+ #define outAL3 AL1 *A_RESTRICT
+ #define outAL4 AL1 *A_RESTRICT
+ #define outAU2 AU1 *A_RESTRICT
+ #define outAU3 AU1 *A_RESTRICT
+ #define outAU4 AU1 *A_RESTRICT
+//------------------------------------------------------------------------------------------------------------------------------
+ // varA*(x): declare a local vector variable named x; initA*: matching brace initializer.
+ #define varAD2(x) AD1 x[2]
+ #define varAD3(x) AD1 x[3]
+ #define varAD4(x) AD1 x[4]
+ #define varAF2(x) AF1 x[2]
+ #define varAF3(x) AF1 x[3]
+ #define varAF4(x) AF1 x[4]
+ #define varAL2(x) AL1 x[2]
+ #define varAL3(x) AL1 x[3]
+ #define varAL4(x) AL1 x[4]
+ #define varAU2(x) AU1 x[2]
+ #define varAU3(x) AU1 x[3]
+ #define varAU4(x) AU1 x[4]
+//------------------------------------------------------------------------------------------------------------------------------
+ #define initAD2(x,y) {x,y}
+ #define initAD3(x,y,z) {x,y,z}
+ #define initAD4(x,y,z,w) {x,y,z,w}
+ #define initAF2(x,y) {x,y}
+ #define initAF3(x,y,z) {x,y,z}
+ #define initAF4(x,y,z,w) {x,y,z,w}
+ #define initAL2(x,y) {x,y}
+ #define initAL3(x,y,z) {x,y,z}
+ #define initAL4(x,y,z,w) {x,y,z,w}
+ #define initAU2(x,y) {x,y}
+ #define initAU3(x,y,z) {x,y,z}
+ #define initAU4(x,y,z,w) {x,y,z,w}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// SCALAR RETURN OPS +//------------------------------------------------------------------------------------------------------------------------------ +// TODO +// ==== +// - Replace transcendentals with manual versions. +//============================================================================================================================== + #ifdef A_GCC + A_STATIC AD1 AAbsD1(AD1 a){return __builtin_fabs(a);} + A_STATIC AF1 AAbsF1(AF1 a){return __builtin_fabsf(a);} + A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(__builtin_abs(ASU1_(a)));} + A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(__builtin_llabs(ASL1_(a)));} + #else + A_STATIC AD1 AAbsD1(AD1 a){return fabs(a);} + A_STATIC AF1 AAbsF1(AF1 a){return fabsf(a);} + A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(abs(ASU1_(a)));} + A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(labs((long)ASL1_(a)));} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 ACosD1(AD1 a){return __builtin_cos(a);} + A_STATIC AF1 ACosF1(AF1 a){return __builtin_cosf(a);} + #else + A_STATIC AD1 ACosD1(AD1 a){return cos(a);} + A_STATIC AF1 ACosF1(AF1 a){return cosf(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ADotD2(inAD2 a,inAD2 b){return a[0]*b[0]+a[1]*b[1];} + A_STATIC AD1 ADotD3(inAD3 a,inAD3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];} + A_STATIC AD1 ADotD4(inAD4 a,inAD4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];} + A_STATIC AF1 
ADotF2(inAF2 a,inAF2 b){return a[0]*b[0]+a[1]*b[1];} + A_STATIC AF1 ADotF3(inAF3 a,inAF3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];} + A_STATIC AF1 ADotF4(inAF4 a,inAF4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 AExp2D1(AD1 a){return __builtin_exp2(a);} + A_STATIC AF1 AExp2F1(AF1 a){return __builtin_exp2f(a);} + #else + A_STATIC AD1 AExp2D1(AD1 a){return exp2(a);} + A_STATIC AF1 AExp2F1(AF1 a){return exp2f(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 AFloorD1(AD1 a){return __builtin_floor(a);} + A_STATIC AF1 AFloorF1(AF1 a){return __builtin_floorf(a);} + #else + A_STATIC AD1 AFloorD1(AD1 a){return floor(a);} + A_STATIC AF1 AFloorF1(AF1 a){return floorf(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ALerpD1(AD1 a,AD1 b,AD1 c){return b*c+(-a*c+a);} + A_STATIC AF1 ALerpF1(AF1 a,AF1 b,AF1 c){return b*c+(-a*c+a);} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 ALog2D1(AD1 a){return __builtin_log2(a);} + A_STATIC AF1 ALog2F1(AF1 a){return __builtin_log2f(a);} + #else + A_STATIC AD1 ALog2D1(AD1 a){return log2(a);} + A_STATIC AF1 ALog2F1(AF1 a){return log2f(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 AMaxD1(AD1 a,AD1 b){return a>b?a:b;} + A_STATIC AF1 AMaxF1(AF1 a,AF1 b){return a>b?a:b;} + A_STATIC AL1 AMaxL1(AL1 a,AL1 b){return a>b?a:b;} + A_STATIC AU1 AMaxU1(AU1 a,AU1 b){return a>b?a:b;} 
+//------------------------------------------------------------------------------------------------------------------------------ + // These follow the convention that A integer types don't have signage, until they are operated on. + A_STATIC AL1 AMaxSL1(AL1 a,AL1 b){return (ASL1_(a)>ASL1_(b))?a:b;} + A_STATIC AU1 AMaxSU1(AU1 a,AU1 b){return (ASU1_(a)>ASU1_(b))?a:b;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 AMinD1(AD1 a,AD1 b){return a>ASL1_(b));} + A_STATIC AU1 AShrSU1(AU1 a,AU1 b){return AU1_(ASU1_(a)>>ASU1_(b));} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 ASinD1(AD1 a){return __builtin_sin(a);} + A_STATIC AF1 ASinF1(AF1 a){return __builtin_sinf(a);} + #else + A_STATIC AD1 ASinD1(AD1 a){return sin(a);} + A_STATIC AF1 ASinF1(AF1 a){return sinf(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 ASqrtD1(AD1 a){return __builtin_sqrt(a);} + A_STATIC AF1 ASqrtF1(AF1 a){return __builtin_sqrtf(a);} + #else + A_STATIC AD1 ASqrtD1(AD1 a){return sqrt(a);} + A_STATIC AF1 ASqrtF1(AF1 a){return sqrtf(a);} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// SCALAR RETURN OPS - DEPENDENT 
+//==============================================================================================================================
+ // AClamp*: clamp x into [n,m].
+ A_STATIC AD1 AClampD1(AD1 x,AD1 n,AD1 m){return AMaxD1(n,AMinD1(x,m));}
+ A_STATIC AF1 AClampF1(AF1 x,AF1 n,AF1 m){return AMaxF1(n,AMinF1(x,m));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // AFract*: fractional part, a - floor(a).
+ A_STATIC AD1 AFractD1(AD1 a){return a-AFloorD1(a);}
+ A_STATIC AF1 AFractF1(AF1 a){return a-AFloorF1(a);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // APow*: a^b computed as exp2(b*log2(a)) (so meaningful only for a>0).
+ A_STATIC AD1 APowD1(AD1 a,AD1 b){return AExp2D1(b*ALog2D1(a));}
+ A_STATIC AF1 APowF1(AF1 a,AF1 b){return AExp2F1(b*ALog2F1(a));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // ARsq*: reciprocal square root.
+ A_STATIC AD1 ARsqD1(AD1 a){return ARcpD1(ASqrtD1(a));}
+ A_STATIC AF1 ARsqF1(AF1 a){return ARcpF1(ASqrtF1(a));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // ASat*: saturate, clamp to [0,1].
+ A_STATIC AD1 ASatD1(AD1 a){return AMinD1(1.0,AMaxD1(0.0,a));}
+ A_STATIC AF1 ASatF1(AF1 a){return AMinF1(1.0f,AMaxF1(0.0f,a));}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// VECTOR OPS
+//------------------------------------------------------------------------------------------------------------------------------
+// These are added as needed for production or prototyping, so not necessarily a complete set.
+// They follow a convention of taking in a destination and also returning the destination value to increase utility.
+//==============================================================================================================================
+ // opA<Op><Type><N>(d,...): per-component op writing into d and returning d.
+ A_STATIC retAD2 opAAbsD2(outAD2 d,inAD2 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);return d;}
+ A_STATIC retAD3 opAAbsD3(outAD3 d,inAD3 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);return d;}
+ A_STATIC retAD4 opAAbsD4(outAD4 d,inAD4 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);d[3]=AAbsD1(a[3]);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAAbsF2(outAF2 d,inAF2 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);return d;}
+ A_STATIC retAF3 opAAbsF3(outAF3 d,inAF3 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);return d;}
+ A_STATIC retAF4 opAAbsF4(outAF4 d,inAF4 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);d[3]=AAbsF1(a[3]);return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;}
+ A_STATIC retAD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;}
+ A_STATIC retAD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;}
+ A_STATIC retAF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;}
+ A_STATIC retAF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;}
+//==============================================================================================================================
+ // *One variants broadcast the scalar b across all components.
+ A_STATIC retAD2 opAAddOneD2(outAD2 d,inAD2 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;return d;}
+ A_STATIC retAD3 opAAddOneD3(outAD3 d,inAD3 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;return d;}
+ A_STATIC retAD4 opAAddOneD4(outAD4 d,inAD4 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;d[3]=a[3]+b;return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAAddOneF2(outAF2 d,inAF2 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;return d;}
+ A_STATIC retAF3 opAAddOneF3(outAF3 d,inAF3 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;return d;}
+ A_STATIC retAF4 opAAddOneF4(outAF4 d,inAF4 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;d[3]=a[3]+b;return d;}
+//==============================================================================================================================
+ // Component-wise copy of a into d.
+ A_STATIC retAD2 opACpyD2(outAD2 d,inAD2 a){d[0]=a[0];d[1]=a[1];return d;}
+ A_STATIC retAD3 opACpyD3(outAD3 d,inAD3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;}
+ A_STATIC retAD4 opACpyD4(outAD4 d,inAD4 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opACpyF2(outAF2 d,inAF2 a){d[0]=a[0];d[1]=a[1];return d;}
+ A_STATIC retAF3 opACpyF3(outAF3 d,inAF3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;}
+ A_STATIC retAF4 opACpyF4(outAF4 d,inAF4 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;}
+//==============================================================================================================================
+ // Per-component lerp of a toward b by the weight vector c.
+ A_STATIC retAD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);return d;}
+ A_STATIC retAD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);return d;}
+ A_STATIC retAD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);d[3]=ALerpD1(a[3],b[3],c[3]);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);return d;}
+ A_STATIC retAF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);return d;}
+ A_STATIC retAF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);d[3]=ALerpF1(a[3],b[3],c[3]);return d;}
+//==============================================================================================================================
+ // Lerp with a single scalar weight c.
+ A_STATIC retAD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);return d;}
+ A_STATIC retAD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);return d;}
+ A_STATIC retAD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);d[3]=ALerpD1(a[3],b[3],c);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);return d;}
+ A_STATIC retAF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);return d;}
+ A_STATIC retAF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);d[3]=ALerpF1(a[3],b[3],c);return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);return d;}
+ A_STATIC retAD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);d[2]=AMaxD1(a[2],b[2]);return d;}
+ A_STATIC retAD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);d[2]=AMaxD1(a[2],b[2]);d[3]=AMaxD1(a[3],b[3]);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);return d;}
+ A_STATIC retAF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);d[2]=AMaxF1(a[2],b[2]);return d;}
+ A_STATIC retAF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);d[2]=AMaxF1(a[2],b[2]);d[3]=AMaxF1(a[3],b[3]);return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);return d;}
+ A_STATIC retAD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);d[2]=AMinD1(a[2],b[2]);return d;}
+ A_STATIC retAD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);d[2]=AMinD1(a[2],b[2]);d[3]=AMinD1(a[3],b[3]);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);return d;}
+ A_STATIC retAF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);d[2]=AMinF1(a[2],b[2]);return d;}
+ A_STATIC retAF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);d[2]=AMinF1(a[2],b[2]);d[3]=AMinF1(a[3],b[3]);return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];return d;}
+ A_STATIC retAD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];return d;}
+ A_STATIC retAD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];d[3]=a[3]*b[3];return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];return d;}
+ A_STATIC retAF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];return d;}
+ A_STATIC retAF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];d[3]=a[3]*b[3];return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;return d;}
+ A_STATIC retAD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;return d;}
+ A_STATIC retAD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;d[3]=a[3]*b;return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;return d;}
+ A_STATIC retAF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;return d;}
+ A_STATIC retAF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;d[3]=a[3]*b;return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opANegD2(outAD2 d,inAD2 a){d[0]=-a[0];d[1]=-a[1];return d;}
+ A_STATIC retAD3 opANegD3(outAD3 d,inAD3 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];return d;}
+ A_STATIC retAD4 opANegD4(outAD4 d,inAD4 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];d[3]=-a[3];return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opANegF2(outAF2 d,inAF2 a){d[0]=-a[0];d[1]=-a[1];return d;}
+ A_STATIC retAF3 opANegF3(outAF3 d,inAF3 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];return d;}
+ A_STATIC retAF4 opANegF4(outAF4 d,inAF4 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];d[3]=-a[3];return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opARcpD2(outAD2 d,inAD2 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);return d;}
+ A_STATIC retAD3 opARcpD3(outAD3 d,inAD3 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);d[2]=ARcpD1(a[2]);return d;}
+ A_STATIC retAD4 opARcpD4(outAD4 d,inAD4 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);d[2]=ARcpD1(a[2]);d[3]=ARcpD1(a[3]);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opARcpF2(outAF2 d,inAF2 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);return d;}
+ A_STATIC retAF3 opARcpF3(outAF3 d,inAF3 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);d[2]=ARcpF1(a[2]);return d;}
+ A_STATIC retAF4 opARcpF4(outAF4 d,inAF4 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);d[2]=ARcpF1(a[2]);d[3]=ARcpF1(a[3]);return d;}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// HALF FLOAT PACKING
+//==============================================================================================================================
+ // Convert float to half (in lower 16-bits of output).
+ // Same fast technique as documented here: ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
+ // Supports denormals.
+ // Conversion rules are to make computations possibly "safer" on the GPU,
+ // -INF & -NaN -> -65504
+ // +INF & +NaN -> +65504
+ // Implementation: the float's top 9 bits (sign + 8-bit exponent) index two 512-entry tables.
+ // 'base' supplies the half's sign/exponent bits (0x7bff/0xfbff = +/-65504 where the exponent is
+ // out of half range) and 'shift' is how far to shift the 23-bit mantissa down: 0x0d narrows it
+ // to 10 bits for normals, larger values produce denormals, and 0x18 discards it entirely.
+ A_STATIC AU1 AU1_AH1_AF1(AF1 f){
+  static AW1 base[512]={
+   0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+   0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+   0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+   0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+   0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+   0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+   0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0001,0x0002,0x0004,0x0008,0x0010,0x0020,0x0040,0x0080,0x0100,
+   0x0200,0x0400,0x0800,0x0c00,0x1000,0x1400,0x1800,0x1c00,0x2000,0x2400,0x2800,0x2c00,0x3000,0x3400,0x3800,0x3c00,
+   0x4000,0x4400,0x4800,0x4c00,0x5000,0x5400,0x5800,0x5c00,0x6000,0x6400,0x6800,0x6c00,0x7000,0x7400,0x7800,0x7bff,
+   0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+   0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+   0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+   0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+   0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+   0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+   0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+   0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+   0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+   0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+   0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+   0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+   0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+   0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8001,0x8002,0x8004,0x8008,0x8010,0x8020,0x8040,0x8080,0x8100,
+   0x8200,0x8400,0x8800,0x8c00,0x9000,0x9400,0x9800,0x9c00,0xa000,0xa400,0xa800,0xac00,0xb000,0xb400,0xb800,0xbc00,
+   0xc000,0xc400,0xc800,0xcc00,0xd000,0xd400,0xd800,0xdc00,0xe000,0xe400,0xe800,0xec00,0xf000,0xf400,0xf800,0xfbff,
+   0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+   0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+   0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+   0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+   0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+   0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+   0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff};
+  static AB1 shift[512]={
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f,
+   0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,
+   0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f,
+   0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,
+   0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18};
+  // Reinterpret the float's bits; i = top 9 bits (table index), low 23 bits = mantissa.
+  union{AF1 f;AU1 u;}bits;bits.f=f;AU1 u=bits.u;AU1 i=u>>23;return (AU1)(base[i])+((u&0x7fffff)>>shift[i]);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Used to output packed constant.
+ A_STATIC AU1 AU1_AH2_AF2(inAF2 a){return AU1_AH1_AF1(a[0])+(AU1_AH1_AF1(a[1])<<16);} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// GLSL +// +// +//============================================================================================================================== +#if defined(A_GLSL) && defined(A_GPU) + #ifndef A_SKIP_EXT + #ifdef A_HALF + #extension GL_EXT_shader_16bit_storage:require + #extension GL_EXT_shader_explicit_arithmetic_types:require + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_LONG + #extension GL_ARB_gpu_shader_int64:require + #extension GL_NV_shader_atomic_int64:require + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_WAVE + #extension GL_KHR_shader_subgroup_arithmetic:require + #extension GL_KHR_shader_subgroup_ballot:require + #extension 
GL_KHR_shader_subgroup_quad:require + #extension GL_KHR_shader_subgroup_shuffle:require + #endif + #endif +//============================================================================================================================== + #define AP1 bool + #define AP2 bvec2 + #define AP3 bvec3 + #define AP4 bvec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AF1 float + #define AF2 vec2 + #define AF3 vec3 + #define AF4 vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1 uint + #define AU2 uvec2 + #define AU3 uvec3 + #define AU4 uvec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASU1 int + #define ASU2 ivec2 + #define ASU3 ivec3 + #define ASU4 ivec4 +//============================================================================================================================== + #define AF1_AU1(x) uintBitsToFloat(AU1(x)) + #define AF2_AU2(x) uintBitsToFloat(AU2(x)) + #define AF3_AU3(x) uintBitsToFloat(AU3(x)) + #define AF4_AU4(x) uintBitsToFloat(AU4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1_AF1(x) floatBitsToUint(AF1(x)) + #define AU2_AF2(x) floatBitsToUint(AF2(x)) + #define AU3_AF3(x) floatBitsToUint(AF3(x)) + #define AU4_AF4(x) floatBitsToUint(AF4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_AH1_AF1_x(AF1 a){return packHalf2x16(AF2(a,0.0));} + #define AU1_AH1_AF1(a) AU1_AH1_AF1_x(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1_AH2_AF2 packHalf2x16 + #define AU1_AW2Unorm_AF2 
packUnorm2x16 + #define AU1_AB4Unorm_AF4 packUnorm4x8 +//------------------------------------------------------------------------------------------------------------------------------ + #define AF2_AH2_AU1 unpackHalf2x16 + #define AF2_AW2Unorm_AU1 unpackUnorm2x16 + #define AF4_AB4Unorm_AU1 unpackUnorm4x8 +//============================================================================================================================== + AF1 AF1_x(AF1 a){return AF1(a);} + AF2 AF2_x(AF1 a){return AF2(a,a);} + AF3 AF3_x(AF1 a){return AF3(a,a,a);} + AF4 AF4_x(AF1 a){return AF4(a,a,a,a);} + #define AF1_(a) AF1_x(AF1(a)) + #define AF2_(a) AF2_x(AF1(a)) + #define AF3_(a) AF3_x(AF1(a)) + #define AF4_(a) AF4_x(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_x(AU1 a){return AU1(a);} + AU2 AU2_x(AU1 a){return AU2(a,a);} + AU3 AU3_x(AU1 a){return AU3(a,a,a);} + AU4 AU4_x(AU1 a){return AU4(a,a,a,a);} + #define AU1_(a) AU1_x(AU1(a)) + #define AU2_(a) AU2_x(AU1(a)) + #define AU3_(a) AU3_x(AU1(a)) + #define AU4_(a) AU4_x(AU1(a)) +//============================================================================================================================== + AU1 AAbsSU1(AU1 a){return AU1(abs(ASU1(a)));} + AU2 AAbsSU2(AU2 a){return AU2(abs(ASU2(a)));} + AU3 AAbsSU3(AU3 a){return AU3(abs(ASU3(a)));} + AU4 AAbsSU4(AU4 a){return AU4(abs(ASU4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 ABfe(AU1 src,AU1 off,AU1 bits){return bitfieldExtract(src,ASU1(off),ASU1(bits));} + AU1 ABfi(AU1 src,AU1 ins,AU1 mask){return (ins&mask)|(src&(~mask));} + // Proxy for V_BFI_B32 where the 'mask' is set as 'bits', 'mask=(1<>ASU1(b));} + AU2 AShrSU2(AU2 a,AU2 b){return AU2(ASU2(a)>>ASU2(b));} + AU3 AShrSU3(AU3 a,AU3 b){return AU3(ASU3(a)>>ASU3(b));} + AU4 AShrSU4(AU4 a,AU4 b){return AU4(ASU4(a)>>ASU4(b));} 
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// GLSL BYTE +//============================================================================================================================== + #ifdef A_BYTE + #define AB1 uint8_t + #define AB2 u8vec2 + #define AB3 u8vec3 + #define AB4 u8vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASB1 int8_t + #define ASB2 i8vec2 + #define ASB3 i8vec3 + #define ASB4 i8vec4 +//------------------------------------------------------------------------------------------------------------------------------ + AB1 AB1_x(AB1 a){return AB1(a);} + AB2 AB2_x(AB1 a){return AB2(a,a);} + AB3 AB3_x(AB1 a){return AB3(a,a,a);} + AB4 AB4_x(AB1 a){return AB4(a,a,a,a);} + #define AB1_(a) AB1_x(AB1(a)) + #define AB2_(a) AB2_x(AB1(a)) + #define AB3_(a) AB3_x(AB1(a)) + #define AB4_(a) AB4_x(AB1(a)) + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// GLSL HALF 
+//============================================================================================================================== + #ifdef A_HALF + #define AH1 float16_t + #define AH2 f16vec2 + #define AH3 f16vec3 + #define AH4 f16vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AW1 uint16_t + #define AW2 u16vec2 + #define AW3 u16vec3 + #define AW4 u16vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASW1 int16_t + #define ASW2 i16vec2 + #define ASW3 i16vec3 + #define ASW4 i16vec4 +//============================================================================================================================== + #define AH2_AU1(x) unpackFloat2x16(AU1(x)) + AH4 AH4_AU2_x(AU2 x){return AH4(unpackFloat2x16(x.x),unpackFloat2x16(x.y));} + #define AH4_AU2(x) AH4_AU2_x(AU2(x)) + #define AW2_AU1(x) unpackUint2x16(AU1(x)) + #define AW4_AU2(x) unpackUint4x16(pack64(AU2(x))) +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1_AH2(x) packFloat2x16(AH2(x)) + AU2 AU2_AH4_x(AH4 x){return AU2(packFloat2x16(x.xy),packFloat2x16(x.zw));} + #define AU2_AH4(x) AU2_AH4_x(AH4(x)) + #define AU1_AW2(x) packUint2x16(AW2(x)) + #define AU2_AW4(x) unpack32(packUint4x16(AW4(x))) +//============================================================================================================================== + #define AW1_AH1(x) halfBitsToUint16(AH1(x)) + #define AW2_AH2(x) halfBitsToUint16(AH2(x)) + #define AW3_AH3(x) halfBitsToUint16(AH3(x)) + #define AW4_AH4(x) halfBitsToUint16(AH4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AH1_AW1(x) uint16BitsToHalf(AW1(x)) + #define AH2_AW2(x) uint16BitsToHalf(AW2(x)) + #define 
AH3_AW3(x) uint16BitsToHalf(AW3(x)) + #define AH4_AW4(x) uint16BitsToHalf(AW4(x)) +//============================================================================================================================== + AH1 AH1_x(AH1 a){return AH1(a);} + AH2 AH2_x(AH1 a){return AH2(a,a);} + AH3 AH3_x(AH1 a){return AH3(a,a,a);} + AH4 AH4_x(AH1 a){return AH4(a,a,a,a);} + #define AH1_(a) AH1_x(AH1(a)) + #define AH2_(a) AH2_x(AH1(a)) + #define AH3_(a) AH3_x(AH1(a)) + #define AH4_(a) AH4_x(AH1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AW1_x(AW1 a){return AW1(a);} + AW2 AW2_x(AW1 a){return AW2(a,a);} + AW3 AW3_x(AW1 a){return AW3(a,a,a);} + AW4 AW4_x(AW1 a){return AW4(a,a,a,a);} + #define AW1_(a) AW1_x(AW1(a)) + #define AW2_(a) AW2_x(AW1(a)) + #define AW3_(a) AW3_x(AW1(a)) + #define AW4_(a) AW4_x(AW1(a)) +//============================================================================================================================== + AW1 AAbsSW1(AW1 a){return AW1(abs(ASW1(a)));} + AW2 AAbsSW2(AW2 a){return AW2(abs(ASW2(a)));} + AW3 AAbsSW3(AW3 a){return AW3(abs(ASW3(a)));} + AW4 AAbsSW4(AW4 a){return AW4(abs(ASW4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AClampH1(AH1 x,AH1 n,AH1 m){return clamp(x,n,m);} + AH2 AClampH2(AH2 x,AH2 n,AH2 m){return clamp(x,n,m);} + AH3 AClampH3(AH3 x,AH3 n,AH3 m){return clamp(x,n,m);} + AH4 AClampH4(AH4 x,AH4 n,AH4 m){return clamp(x,n,m);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AFractH1(AH1 x){return fract(x);} + AH2 AFractH2(AH2 x){return fract(x);} + AH3 AFractH3(AH3 x){return fract(x);} + AH4 AFractH4(AH4 x){return fract(x);} +//------------------------------------------------------------------------------------------------------------------------------ + 
AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return mix(x,y,a);} + AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return mix(x,y,a);} + AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return mix(x,y,a);} + AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return mix(x,y,a);} +//------------------------------------------------------------------------------------------------------------------------------ + // No packed version of max3. + AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));} + AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));} + AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));} + AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASU1(a),ASU1(b)));} + AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASU2(a),ASU2(b)));} + AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASU3(a),ASU3(b)));} + AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + // No packed version of min3. 
+ AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));} + AH2 AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));} + AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));} + AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASU1(a),ASU1(b)));} + AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASU2(a),ASU2(b)));} + AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASU3(a),ASU3(b)));} + AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ARcpH1(AH1 x){return AH1_(1.0)/x;} + AH2 ARcpH2(AH2 x){return AH2_(1.0)/x;} + AH3 ARcpH3(AH3 x){return AH3_(1.0)/x;} + AH4 ARcpH4(AH4 x){return AH4_(1.0)/x;} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ARsqH1(AH1 x){return AH1_(1.0)/sqrt(x);} + AH2 ARsqH2(AH2 x){return AH2_(1.0)/sqrt(x);} + AH3 ARsqH3(AH3 x){return AH3_(1.0)/sqrt(x);} + AH4 ARsqH4(AH4 x){return AH4_(1.0)/sqrt(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ASatH1(AH1 x){return clamp(x,AH1_(0.0),AH1_(1.0));} + AH2 ASatH2(AH2 x){return clamp(x,AH2_(0.0),AH2_(1.0));} + AH3 ASatH3(AH3 x){return clamp(x,AH3_(0.0),AH3_(1.0));} + AH4 ASatH4(AH4 x){return clamp(x,AH4_(0.0),AH4_(1.0));} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AShrSW1(AW1 a,AW1 b){return AW1(ASW1(a)>>ASW1(b));} + AW2 AShrSW2(AW2 a,AW2 b){return AW2(ASW2(a)>>ASW2(b));} + AW3 AShrSW3(AW3 a,AW3 b){return AW3(ASW3(a)>>ASW3(b));} + AW4 AShrSW4(AW4 a,AW4 b){return AW4(ASW4(a)>>ASW4(b));} + #endif 
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// GLSL DOUBLE +//============================================================================================================================== + #ifdef A_DUBL + #define AD1 double + #define AD2 dvec2 + #define AD3 dvec3 + #define AD4 dvec4 +//------------------------------------------------------------------------------------------------------------------------------ + AD1 AD1_x(AD1 a){return AD1(a);} + AD2 AD2_x(AD1 a){return AD2(a,a);} + AD3 AD3_x(AD1 a){return AD3(a,a,a);} + AD4 AD4_x(AD1 a){return AD4(a,a,a,a);} + #define AD1_(a) AD1_x(AD1(a)) + #define AD2_(a) AD2_x(AD1(a)) + #define AD3_(a) AD3_x(AD1(a)) + #define AD4_(a) AD4_x(AD1(a)) +//============================================================================================================================== + AD1 AFractD1(AD1 x){return fract(x);} + AD2 AFractD2(AD2 x){return fract(x);} + AD3 AFractD3(AD3 x){return fract(x);} + AD4 AFractD4(AD4 x){return fract(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ALerpD1(AD1 x,AD1 y,AD1 a){return mix(x,y,a);} + AD2 ALerpD2(AD2 x,AD2 y,AD2 a){return mix(x,y,a);} + AD3 ALerpD3(AD3 x,AD3 y,AD3 a){return mix(x,y,a);} + AD4 ALerpD4(AD4 x,AD4 y,AD4 a){return mix(x,y,a);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ARcpD1(AD1 x){return AD1_(1.0)/x;} + AD2 ARcpD2(AD2 
x){return AD2_(1.0)/x;} + AD3 ARcpD3(AD3 x){return AD3_(1.0)/x;} + AD4 ARcpD4(AD4 x){return AD4_(1.0)/x;} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ARsqD1(AD1 x){return AD1_(1.0)/sqrt(x);} + AD2 ARsqD2(AD2 x){return AD2_(1.0)/sqrt(x);} + AD3 ARsqD3(AD3 x){return AD3_(1.0)/sqrt(x);} + AD4 ARsqD4(AD4 x){return AD4_(1.0)/sqrt(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ASatD1(AD1 x){return clamp(x,AD1_(0.0),AD1_(1.0));} + AD2 ASatD2(AD2 x){return clamp(x,AD2_(0.0),AD2_(1.0));} + AD3 ASatD3(AD3 x){return clamp(x,AD3_(0.0),AD3_(1.0));} + AD4 ASatD4(AD4 x){return clamp(x,AD4_(0.0),AD4_(1.0));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// GLSL LONG +//============================================================================================================================== + #ifdef A_LONG + #define AL1 uint64_t + #define AL2 u64vec2 + #define AL3 u64vec3 + #define AL4 u64vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASL1 int64_t + #define ASL2 i64vec2 + #define ASL3 i64vec3 + #define ASL4 i64vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AL1_AU2(x) packUint2x32(AU2(x)) + #define AU2_AL1(x) 
unpackUint2x32(AL1(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AL1 AL1_x(AL1 a){return AL1(a);} + AL2 AL2_x(AL1 a){return AL2(a,a);} + AL3 AL3_x(AL1 a){return AL3(a,a,a);} + AL4 AL4_x(AL1 a){return AL4(a,a,a,a);} + #define AL1_(a) AL1_x(AL1(a)) + #define AL2_(a) AL2_x(AL1(a)) + #define AL3_(a) AL3_x(AL1(a)) + #define AL4_(a) AL4_x(AL1(a)) +//============================================================================================================================== + AL1 AAbsSL1(AL1 a){return AL1(abs(ASL1(a)));} + AL2 AAbsSL2(AL2 a){return AL2(abs(ASL2(a)));} + AL3 AAbsSL3(AL3 a){return AL3(abs(ASL3(a)));} + AL4 AAbsSL4(AL4 a){return AL4(abs(ASL4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + AL1 AMaxSL1(AL1 a,AL1 b){return AL1(max(ASU1(a),ASU1(b)));} + AL2 AMaxSL2(AL2 a,AL2 b){return AL2(max(ASU2(a),ASU2(b)));} + AL3 AMaxSL3(AL3 a,AL3 b){return AL3(max(ASU3(a),ASU3(b)));} + AL4 AMaxSL4(AL4 a,AL4 b){return AL4(max(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AL1 AMinSL1(AL1 a,AL1 b){return AL1(min(ASU1(a),ASU1(b)));} + AL2 AMinSL2(AL2 a,AL2 b){return AL2(min(ASU2(a),ASU2(b)));} + AL3 AMinSL3(AL3 a,AL3 b){return AL3(min(ASU3(a),ASU3(b)));} + AL4 AMinSL4(AL4 a,AL4 b){return AL4(min(ASU4(a),ASU4(b)));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ 
+//============================================================================================================================== +// WAVE OPERATIONS +//============================================================================================================================== + #ifdef A_WAVE + // Where 'x' must be a compile time literal. + AF1 AWaveXorF1(AF1 v,AU1 x){return subgroupShuffleXor(v,x);} + AF2 AWaveXorF2(AF2 v,AU1 x){return subgroupShuffleXor(v,x);} + AF3 AWaveXorF3(AF3 v,AU1 x){return subgroupShuffleXor(v,x);} + AF4 AWaveXorF4(AF4 v,AU1 x){return subgroupShuffleXor(v,x);} + AU1 AWaveXorU1(AU1 v,AU1 x){return subgroupShuffleXor(v,x);} + AU2 AWaveXorU2(AU2 v,AU1 x){return subgroupShuffleXor(v,x);} + AU3 AWaveXorU3(AU3 v,AU1 x){return subgroupShuffleXor(v,x);} + AU4 AWaveXorU4(AU4 v,AU1 x){return subgroupShuffleXor(v,x);} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_HALF + AH2 AWaveXorH2(AH2 v,AU1 x){return AH2_AU1(subgroupShuffleXor(AU1_AH2(v),x));} + AH4 AWaveXorH4(AH4 v,AU1 x){return AH4_AU2(subgroupShuffleXor(AU2_AH4(v),x));} + AW2 AWaveXorW2(AW2 v,AU1 x){return AW2_AU1(subgroupShuffleXor(AU1_AW2(v),x));} + AW4 AWaveXorW4(AW4 v,AU1 x){return AW4_AU2(subgroupShuffleXor(AU2_AW4(v),x));} + #endif + #endif +//============================================================================================================================== +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// HLSL +// +// +//============================================================================================================================== +#if defined(A_HLSL) && defined(A_GPU) + #ifdef A_HLSL_6_2 + #define AP1 bool + #define AP2 bool2 + #define AP3 bool3 + #define AP4 bool4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AF1 float32_t + #define AF2 float32_t2 + #define AF3 float32_t3 + #define AF4 float32_t4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1 uint32_t + #define AU2 uint32_t2 + #define AU3 uint32_t3 + #define AU4 uint32_t4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASU1 int32_t + #define ASU2 int32_t2 + #define ASU3 int32_t3 + #define ASU4 int32_t4 + #else + #define AP1 bool + #define AP2 bool2 + #define AP3 bool3 + #define AP4 bool4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AF1 float + #define AF2 float2 + #define AF3 float3 + #define AF4 float4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1 uint + #define AU2 uint2 + #define AU3 uint3 + #define AU4 
uint4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASU1 int + #define ASU2 int2 + #define ASU3 int3 + #define ASU4 int4 + #endif +//============================================================================================================================== + #define AF1_AU1(x) asfloat(AU1(x)) + #define AF2_AU2(x) asfloat(AU2(x)) + #define AF3_AU3(x) asfloat(AU3(x)) + #define AF4_AU4(x) asfloat(AU4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1_AF1(x) asuint(AF1(x)) + #define AU2_AF2(x) asuint(AF2(x)) + #define AU3_AF3(x) asuint(AF3(x)) + #define AU4_AF4(x) asuint(AF4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_AH1_AF1_x(AF1 a){return f32tof16(a);} + #define AU1_AH1_AF1(a) AU1_AH1_AF1_x(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_AH2_AF2_x(AF2 a){return f32tof16(a.x)|(f32tof16(a.y)<<16);} + #define AU1_AH2_AF2(a) AU1_AH2_AF2_x(AF2(a)) + #define AU1_AB4Unorm_AF4(x) D3DCOLORtoUBYTE4(AF4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AF2 AF2_AH2_AU1_x(AU1 x){return AF2(f16tof32(x&0xFFFF),f16tof32(x>>16));} + #define AF2_AH2_AU1(x) AF2_AH2_AU1_x(AU1(x)) +//============================================================================================================================== + AF1 AF1_x(AF1 a){return AF1(a);} + AF2 AF2_x(AF1 a){return AF2(a,a);} + AF3 AF3_x(AF1 a){return AF3(a,a,a);} + AF4 AF4_x(AF1 a){return AF4(a,a,a,a);} + #define AF1_(a) AF1_x(AF1(a)) + #define AF2_(a) AF2_x(AF1(a)) + #define AF3_(a) AF3_x(AF1(a)) + #define AF4_(a) AF4_x(AF1(a)) 
+//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_x(AU1 a){return AU1(a);} + AU2 AU2_x(AU1 a){return AU2(a,a);} + AU3 AU3_x(AU1 a){return AU3(a,a,a);} + AU4 AU4_x(AU1 a){return AU4(a,a,a,a);} + #define AU1_(a) AU1_x(AU1(a)) + #define AU2_(a) AU2_x(AU1(a)) + #define AU3_(a) AU3_x(AU1(a)) + #define AU4_(a) AU4_x(AU1(a)) +//============================================================================================================================== + AU1 AAbsSU1(AU1 a){return AU1(abs(ASU1(a)));} + AU2 AAbsSU2(AU2 a){return AU2(abs(ASU2(a)));} + AU3 AAbsSU3(AU3 a){return AU3(abs(ASU3(a)));} + AU4 AAbsSU4(AU4 a){return AU4(abs(ASU4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 ABfe(AU1 src,AU1 off,AU1 bits){AU1 mask=(1u<>off)&mask;} + AU1 ABfi(AU1 src,AU1 ins,AU1 mask){return (ins&mask)|(src&(~mask));} + AU1 ABfiM(AU1 src,AU1 ins,AU1 bits){AU1 mask=(1u<>ASU1(b));} + AU2 AShrSU2(AU2 a,AU2 b){return AU2(ASU2(a)>>ASU2(b));} + AU3 AShrSU3(AU3 a,AU3 b){return AU3(ASU3(a)>>ASU3(b));} + AU4 AShrSU4(AU4 a,AU4 b){return AU4(ASU4(a)>>ASU4(b));} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HLSL BYTE +//============================================================================================================================== + #ifdef A_BYTE + #endif 
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HLSL HALF +//============================================================================================================================== + #ifdef A_HALF + #ifdef A_HLSL_6_2 + #define AH1 float16_t + #define AH2 float16_t2 + #define AH3 float16_t3 + #define AH4 float16_t4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AW1 uint16_t + #define AW2 uint16_t2 + #define AW3 uint16_t3 + #define AW4 uint16_t4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASW1 int16_t + #define ASW2 int16_t2 + #define ASW3 int16_t3 + #define ASW4 int16_t4 + #else + #define AH1 min16float + #define AH2 min16float2 + #define AH3 min16float3 + #define AH4 min16float4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AW1 min16uint + #define AW2 min16uint2 + #define AW3 min16uint3 + #define AW4 min16uint4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASW1 min16int + #define ASW2 min16int2 + #define ASW3 min16int3 + #define ASW4 min16int4 + #endif +//============================================================================================================================== + // Need to use manual unpack to get 
optimal execution (don't use packed types in buffers directly). + // Unpack requires this pattern: https://gpuopen.com/first-steps-implementing-fp16/ + AH2 AH2_AU1_x(AU1 x){AF2 t=f16tof32(AU2(x&0xFFFF,x>>16));return AH2(t);} + AH4 AH4_AU2_x(AU2 x){return AH4(AH2_AU1_x(x.x),AH2_AU1_x(x.y));} + AW2 AW2_AU1_x(AU1 x){AU2 t=AU2(x&0xFFFF,x>>16);return AW2(t);} + AW4 AW4_AU2_x(AU2 x){return AW4(AW2_AU1_x(x.x),AW2_AU1_x(x.y));} + #define AH2_AU1(x) AH2_AU1_x(AU1(x)) + #define AH4_AU2(x) AH4_AU2_x(AU2(x)) + #define AW2_AU1(x) AW2_AU1_x(AU1(x)) + #define AW4_AU2(x) AW4_AU2_x(AU2(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_AH2_x(AH2 x){return f32tof16(x.x)+(f32tof16(x.y)<<16);} + AU2 AU2_AH4_x(AH4 x){return AU2(AU1_AH2_x(x.xy),AU1_AH2_x(x.zw));} + AU1 AU1_AW2_x(AW2 x){return AU1(x.x)+(AU1(x.y)<<16);} + AU2 AU2_AW4_x(AW4 x){return AU2(AU1_AW2_x(x.xy),AU1_AW2_x(x.zw));} + #define AU1_AH2(x) AU1_AH2_x(AH2(x)) + #define AU2_AH4(x) AU2_AH4_x(AH4(x)) + #define AU1_AW2(x) AU1_AW2_x(AW2(x)) + #define AU2_AW4(x) AU2_AW4_x(AW4(x)) +//============================================================================================================================== + #if defined(A_HLSL_6_2) && !defined(A_NO_16_BIT_CAST) + #define AW1_AH1(x) asuint16(x) + #define AW2_AH2(x) asuint16(x) + #define AW3_AH3(x) asuint16(x) + #define AW4_AH4(x) asuint16(x) + #else + #define AW1_AH1(a) AW1(f32tof16(AF1(a))) + #define AW2_AH2(a) AW2(AW1_AH1((a).x),AW1_AH1((a).y)) + #define AW3_AH3(a) AW3(AW1_AH1((a).x),AW1_AH1((a).y),AW1_AH1((a).z)) + #define AW4_AH4(a) AW4(AW1_AH1((a).x),AW1_AH1((a).y),AW1_AH1((a).z),AW1_AH1((a).w)) + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #if defined(A_HLSL_6_2) && !defined(A_NO_16_BIT_CAST) + #define AH1_AW1(x) asfloat16(x) + #define AH2_AW2(x) asfloat16(x) + #define AH3_AW3(x) 
asfloat16(x) + #define AH4_AW4(x) asfloat16(x) + #else + #define AH1_AW1(a) AH1(f16tof32(AU1(a))) + #define AH2_AW2(a) AH2(AH1_AW1((a).x),AH1_AW1((a).y)) + #define AH3_AW3(a) AH3(AH1_AW1((a).x),AH1_AW1((a).y),AH1_AW1((a).z)) + #define AH4_AW4(a) AH4(AH1_AW1((a).x),AH1_AW1((a).y),AH1_AW1((a).z),AH1_AW1((a).w)) + #endif +//============================================================================================================================== + AH1 AH1_x(AH1 a){return AH1(a);} + AH2 AH2_x(AH1 a){return AH2(a,a);} + AH3 AH3_x(AH1 a){return AH3(a,a,a);} + AH4 AH4_x(AH1 a){return AH4(a,a,a,a);} + #define AH1_(a) AH1_x(AH1(a)) + #define AH2_(a) AH2_x(AH1(a)) + #define AH3_(a) AH3_x(AH1(a)) + #define AH4_(a) AH4_x(AH1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AW1_x(AW1 a){return AW1(a);} + AW2 AW2_x(AW1 a){return AW2(a,a);} + AW3 AW3_x(AW1 a){return AW3(a,a,a);} + AW4 AW4_x(AW1 a){return AW4(a,a,a,a);} + #define AW1_(a) AW1_x(AW1(a)) + #define AW2_(a) AW2_x(AW1(a)) + #define AW3_(a) AW3_x(AW1(a)) + #define AW4_(a) AW4_x(AW1(a)) +//============================================================================================================================== + AW1 AAbsSW1(AW1 a){return AW1(abs(ASW1(a)));} + AW2 AAbsSW2(AW2 a){return AW2(abs(ASW2(a)));} + AW3 AAbsSW3(AW3 a){return AW3(abs(ASW3(a)));} + AW4 AAbsSW4(AW4 a){return AW4(abs(ASW4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AClampH1(AH1 x,AH1 n,AH1 m){return max(n,min(x,m));} + AH2 AClampH2(AH2 x,AH2 n,AH2 m){return max(n,min(x,m));} + AH3 AClampH3(AH3 x,AH3 n,AH3 m){return max(n,min(x,m));} + AH4 AClampH4(AH4 x,AH4 n,AH4 m){return max(n,min(x,m));} +//------------------------------------------------------------------------------------------------------------------------------ + // V_FRACT_F16 (note DX frac() is 
different). + AH1 AFractH1(AH1 x){return x-floor(x);} + AH2 AFractH2(AH2 x){return x-floor(x);} + AH3 AFractH3(AH3 x){return x-floor(x);} + AH4 AFractH4(AH4 x){return x-floor(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return lerp(x,y,a);} + AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return lerp(x,y,a);} + AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return lerp(x,y,a);} + AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return lerp(x,y,a);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));} + AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));} + AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));} + AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASU1(a),ASU1(b)));} + AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASU2(a),ASU2(b)));} + AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASU3(a),ASU3(b)));} + AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));} + AH2 AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));} + AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));} + AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASU1(a),ASU1(b)));} + AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASU2(a),ASU2(b)));} + AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASU3(a),ASU3(b)));} + AW4 AMinSW4(AW4 a,AW4 b){return 
AW4(min(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ARcpH1(AH1 x){return rcp(x);} + AH2 ARcpH2(AH2 x){return rcp(x);} + AH3 ARcpH3(AH3 x){return rcp(x);} + AH4 ARcpH4(AH4 x){return rcp(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ARsqH1(AH1 x){return rsqrt(x);} + AH2 ARsqH2(AH2 x){return rsqrt(x);} + AH3 ARsqH3(AH3 x){return rsqrt(x);} + AH4 ARsqH4(AH4 x){return rsqrt(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ASatH1(AH1 x){return saturate(x);} + AH2 ASatH2(AH2 x){return saturate(x);} + AH3 ASatH3(AH3 x){return saturate(x);} + AH4 ASatH4(AH4 x){return saturate(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AShrSW1(AW1 a,AW1 b){return AW1(ASW1(a)>>ASW1(b));} + AW2 AShrSW2(AW2 a,AW2 b){return AW2(ASW2(a)>>ASW2(b));} + AW3 AShrSW3(AW3 a,AW3 b){return AW3(ASW3(a)>>ASW3(b));} + AW4 AShrSW4(AW4 a,AW4 b){return AW4(ASW4(a)>>ASW4(b));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HLSL DOUBLE +//============================================================================================================================== + #ifdef A_DUBL + #ifdef A_HLSL_6_2 + #define AD1 float64_t + #define AD2 float64_t2 
+ #define AD3 float64_t3 + #define AD4 float64_t4 + #else + #define AD1 double + #define AD2 double2 + #define AD3 double3 + #define AD4 double4 + #endif +//------------------------------------------------------------------------------------------------------------------------------ + AD1 AD1_x(AD1 a){return AD1(a);} + AD2 AD2_x(AD1 a){return AD2(a,a);} + AD3 AD3_x(AD1 a){return AD3(a,a,a);} + AD4 AD4_x(AD1 a){return AD4(a,a,a,a);} + #define AD1_(a) AD1_x(AD1(a)) + #define AD2_(a) AD2_x(AD1(a)) + #define AD3_(a) AD3_x(AD1(a)) + #define AD4_(a) AD4_x(AD1(a)) +//============================================================================================================================== + AD1 AFractD1(AD1 a){return a-floor(a);} + AD2 AFractD2(AD2 a){return a-floor(a);} + AD3 AFractD3(AD3 a){return a-floor(a);} + AD4 AFractD4(AD4 a){return a-floor(a);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ALerpD1(AD1 x,AD1 y,AD1 a){return lerp(x,y,a);} + AD2 ALerpD2(AD2 x,AD2 y,AD2 a){return lerp(x,y,a);} + AD3 ALerpD3(AD3 x,AD3 y,AD3 a){return lerp(x,y,a);} + AD4 ALerpD4(AD4 x,AD4 y,AD4 a){return lerp(x,y,a);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ARcpD1(AD1 x){return rcp(x);} + AD2 ARcpD2(AD2 x){return rcp(x);} + AD3 ARcpD3(AD3 x){return rcp(x);} + AD4 ARcpD4(AD4 x){return rcp(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ARsqD1(AD1 x){return rsqrt(x);} + AD2 ARsqD2(AD2 x){return rsqrt(x);} + AD3 ARsqD3(AD3 x){return rsqrt(x);} + AD4 ARsqD4(AD4 x){return rsqrt(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ASatD1(AD1 x){return saturate(x);} + AD2 ASatD2(AD2 x){return saturate(x);} + AD3 ASatD3(AD3 
x){return saturate(x);} + AD4 ASatD4(AD4 x){return saturate(x);} + #endif +//============================================================================================================================== +// HLSL WAVE +//============================================================================================================================== + #ifdef A_WAVE + // Where 'x' must be a compile time literal. + AF1 AWaveXorF1(AF1 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} + AF2 AWaveXorF2(AF2 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} + AF3 AWaveXorF3(AF3 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} + AF4 AWaveXorF4(AF4 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} + AU1 AWaveXorU1(AU1 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} + AU2 AWaveXorU1(AU2 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} + AU3 AWaveXorU1(AU3 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} + AU4 AWaveXorU1(AU4 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_HALF + AH2 AWaveXorH2(AH2 v,AU1 x){return AH2_AU1(WaveReadLaneAt(AU1_AH2(v),WaveGetLaneIndex()^x));} + AH4 AWaveXorH4(AH4 v,AU1 x){return AH4_AU2(WaveReadLaneAt(AU2_AH4(v),WaveGetLaneIndex()^x));} + AW2 AWaveXorW2(AW2 v,AU1 x){return AW2_AU1(WaveReadLaneAt(AU1_AW2(v),WaveGetLaneIndex()^x));} + AW4 AWaveXorW4(AW4 v,AU1 x){return AW4_AU1(WaveReadLaneAt(AU1_AW4(v),WaveGetLaneIndex()^x));} + #endif + #endif +//============================================================================================================================== +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// GPU COMMON +// +// +//============================================================================================================================== +#ifdef A_GPU + // Negative and positive infinity. + #define A_INFP_F AF1_AU1(0x7f800000u) + #define A_INFN_F AF1_AU1(0xff800000u) +//------------------------------------------------------------------------------------------------------------------------------ + // Copy sign from 's' to positive 'd'. 
+ AF1 ACpySgnF1(AF1 d,AF1 s){return AF1_AU1(AU1_AF1(d)|(AU1_AF1(s)&AU1_(0x80000000u)));} + AF2 ACpySgnF2(AF2 d,AF2 s){return AF2_AU2(AU2_AF2(d)|(AU2_AF2(s)&AU2_(0x80000000u)));} + AF3 ACpySgnF3(AF3 d,AF3 s){return AF3_AU3(AU3_AF3(d)|(AU3_AF3(s)&AU3_(0x80000000u)));} + AF4 ACpySgnF4(AF4 d,AF4 s){return AF4_AU4(AU4_AF4(d)|(AU4_AF4(s)&AU4_(0x80000000u)));} +//------------------------------------------------------------------------------------------------------------------------------ + // Single operation to return (useful to create a mask to use in lerp for branch free logic), + // m=NaN := 0 + // m>=0 := 0 + // m<0 := 1 + // Uses the following useful floating point logic, + // saturate(+a*(-INF)==-INF) := 0 + // saturate( 0*(-INF)== NaN) := 0 + // saturate(-a*(-INF)==+INF) := 1 + AF1 ASignedF1(AF1 m){return ASatF1(m*AF1_(A_INFN_F));} + AF2 ASignedF2(AF2 m){return ASatF2(m*AF2_(A_INFN_F));} + AF3 ASignedF3(AF3 m){return ASatF3(m*AF3_(A_INFN_F));} + AF4 ASignedF4(AF4 m){return ASatF4(m*AF4_(A_INFN_F));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AGtZeroF1(AF1 m){return ASatF1(m*AF1_(A_INFP_F));} + AF2 AGtZeroF2(AF2 m){return ASatF2(m*AF2_(A_INFP_F));} + AF3 AGtZeroF3(AF3 m){return ASatF3(m*AF3_(A_INFP_F));} + AF4 AGtZeroF4(AF4 m){return ASatF4(m*AF4_(A_INFP_F));} +//============================================================================================================================== + #ifdef A_HALF + #ifdef A_HLSL_6_2 + #define A_INFP_H AH1_AW1((uint16_t)0x7c00u) + #define A_INFN_H AH1_AW1((uint16_t)0xfc00u) + #else + #define A_INFP_H AH1_AW1(0x7c00u) + #define A_INFN_H AH1_AW1(0xfc00u) + #endif + +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ACpySgnH1(AH1 d,AH1 s){return AH1_AW1(AW1_AH1(d)|(AW1_AH1(s)&AW1_(0x8000u)));} + AH2 ACpySgnH2(AH2 d,AH2 s){return 
AH2_AW2(AW2_AH2(d)|(AW2_AH2(s)&AW2_(0x8000u)));} + AH3 ACpySgnH3(AH3 d,AH3 s){return AH3_AW3(AW3_AH3(d)|(AW3_AH3(s)&AW3_(0x8000u)));} + AH4 ACpySgnH4(AH4 d,AH4 s){return AH4_AW4(AW4_AH4(d)|(AW4_AH4(s)&AW4_(0x8000u)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ASignedH1(AH1 m){return ASatH1(m*AH1_(A_INFN_H));} + AH2 ASignedH2(AH2 m){return ASatH2(m*AH2_(A_INFN_H));} + AH3 ASignedH3(AH3 m){return ASatH3(m*AH3_(A_INFN_H));} + AH4 ASignedH4(AH4 m){return ASatH4(m*AH4_(A_INFN_H));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AGtZeroH1(AH1 m){return ASatH1(m*AH1_(A_INFP_H));} + AH2 AGtZeroH2(AH2 m){return ASatH2(m*AH2_(A_INFP_H));} + AH3 AGtZeroH3(AH3 m){return ASatH3(m*AH3_(A_INFP_H));} + AH4 AGtZeroH4(AH4 m){return ASatH4(m*AH4_(A_INFP_H));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// [FIS] FLOAT INTEGER SORTABLE +//------------------------------------------------------------------------------------------------------------------------------ +// Float to integer sortable. +// - If sign bit=0, flip the sign bit (positives). +// - If sign bit=1, flip all bits (negatives). +// Integer sortable to float. +// - If sign bit=1, flip the sign bit (positives). +// - If sign bit=0, flip all bits (negatives). +// Has nice side effects. +// - Larger integers are more positive values. 
+// - Float zero is mapped to center of integers (so clear to integer zero is a nice default for atomic max usage). +// Burns 3 ops for conversion {shift,or,xor}. +//============================================================================================================================== + AU1 AFisToU1(AU1 x){return x^(( AShrSU1(x,AU1_(31)))|AU1_(0x80000000));} + AU1 AFisFromU1(AU1 x){return x^((~AShrSU1(x,AU1_(31)))|AU1_(0x80000000));} +//------------------------------------------------------------------------------------------------------------------------------ + // Just adjust high 16-bit value (useful when upper part of 32-bit word is a 16-bit float value). + AU1 AFisToHiU1(AU1 x){return x^(( AShrSU1(x,AU1_(15)))|AU1_(0x80000000));} + AU1 AFisFromHiU1(AU1 x){return x^((~AShrSU1(x,AU1_(15)))|AU1_(0x80000000));} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_HALF + AW1 AFisToW1(AW1 x){return x^(( AShrSW1(x,AW1_(15)))|AW1_(0x8000));} + AW1 AFisFromW1(AW1 x){return x^((~AShrSW1(x,AW1_(15)))|AW1_(0x8000));} +//------------------------------------------------------------------------------------------------------------------------------ + AW2 AFisToW2(AW2 x){return x^(( AShrSW2(x,AW2_(15)))|AW2_(0x8000));} + AW2 AFisFromW2(AW2 x){return x^((~AShrSW2(x,AW2_(15)))|AW2_(0x8000));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// [PERM] V_PERM_B32 
+//------------------------------------------------------------------------------------------------------------------------------ +// Support for V_PERM_B32 started in the 3rd generation of GCN. +//------------------------------------------------------------------------------------------------------------------------------ +// yyyyxxxx - The 'i' input. +// 76543210 +// ======== +// HGFEDCBA - Naming on permutation. +//------------------------------------------------------------------------------------------------------------------------------ +// TODO +// ==== +// - Make sure compiler optimizes this. +//============================================================================================================================== + #ifdef A_HALF + AU1 APerm0E0A(AU2 i){return((i.x )&0xffu)|((i.y<<16)&0xff0000u);} + AU1 APerm0F0B(AU2 i){return((i.x>> 8)&0xffu)|((i.y<< 8)&0xff0000u);} + AU1 APerm0G0C(AU2 i){return((i.x>>16)&0xffu)|((i.y )&0xff0000u);} + AU1 APerm0H0D(AU2 i){return((i.x>>24)&0xffu)|((i.y>> 8)&0xff0000u);} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 APermHGFA(AU2 i){return((i.x )&0x000000ffu)|(i.y&0xffffff00u);} + AU1 APermHGFC(AU2 i){return((i.x>>16)&0x000000ffu)|(i.y&0xffffff00u);} + AU1 APermHGAE(AU2 i){return((i.x<< 8)&0x0000ff00u)|(i.y&0xffff00ffu);} + AU1 APermHGCE(AU2 i){return((i.x>> 8)&0x0000ff00u)|(i.y&0xffff00ffu);} + AU1 APermHAFE(AU2 i){return((i.x<<16)&0x00ff0000u)|(i.y&0xff00ffffu);} + AU1 APermHCFE(AU2 i){return((i.x )&0x00ff0000u)|(i.y&0xff00ffffu);} + AU1 APermAGFE(AU2 i){return((i.x<<24)&0xff000000u)|(i.y&0x00ffffffu);} + AU1 APermCGFE(AU2 i){return((i.x<< 8)&0xff000000u)|(i.y&0x00ffffffu);} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 APermGCEA(AU2 i){return((i.x)&0x00ff00ffu)|((i.y<<8)&0xff00ff00u);} + AU1 APermGECA(AU2 
i){return(((i.x)&0xffu)|((i.x>>8)&0xff00u)|((i.y<<16)&0xff0000u)|((i.y<<8)&0xff000000u));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// [BUC] BYTE UNSIGNED CONVERSION +//------------------------------------------------------------------------------------------------------------------------------ +// Designed to use the optimal conversion, enables the scaling to possibly be factored into other computation. +// Works on a range of {0 to A_BUC_<32,16>}, for <32-bit, and 16-bit> respectively. +//------------------------------------------------------------------------------------------------------------------------------ +// OPCODE NOTES +// ============ +// GCN does not do UNORM or SNORM for bytes in opcodes. +// - V_CVT_F32_UBYTE{0,1,2,3} - Unsigned byte to float. +// - V_CVT_PKACC_U8_F32 - Float to unsigned byte (does bit-field insert into 32-bit integer). +// V_PERM_B32 does byte packing with ability to zero fill bytes as well. +// - Can pull out byte values from two sources, and zero fill upper 8-bits of packed hi and lo. +//------------------------------------------------------------------------------------------------------------------------------ +// BYTE : FLOAT - ABuc{0,1,2,3}{To,From}U1() - Designed for V_CVT_F32_UBYTE* and V_CVT_PKACCUM_U8_F32 ops. +// ==== ===== +// 0 : 0 +// 1 : 1 +// ... 
+// 255 : 255 +// : 256 (just outside the encoding range) +//------------------------------------------------------------------------------------------------------------------------------ +// BYTE : FLOAT - ABuc{0,1,2,3}{To,From}U2() - Designed for 16-bit denormal tricks and V_PERM_B32. +// ==== ===== +// 0 : 0 +// 1 : 1/512 +// 2 : 1/256 +// ... +// 64 : 1/8 +// 128 : 1/4 +// 255 : 255/512 +// : 1/2 (just outside the encoding range) +//------------------------------------------------------------------------------------------------------------------------------ +// OPTIMAL IMPLEMENTATIONS ON AMD ARCHITECTURES +// ============================================ +// r=ABuc0FromU1(i) +// V_CVT_F32_UBYTE0 r,i +// -------------------------------------------- +// r=ABuc0ToU1(d,i) +// V_CVT_PKACCUM_U8_F32 r,i,0,d +// -------------------------------------------- +// d=ABuc0FromU2(i) +// Where 'k0' is an SGPR with 0x0E0A +// Where 'k1' is an SGPR with {32768.0} packed into the lower 16-bits +// V_PERM_B32 d,i.x,i.y,k0 +// V_PK_FMA_F16 d,d,k1.x,0 +// -------------------------------------------- +// r=ABuc0ToU2(d,i) +// Where 'k0' is an SGPR with {1.0/32768.0} packed into the lower 16-bits +// Where 'k1' is an SGPR with 0x???? +// Where 'k2' is an SGPR with 0x???? +// V_PK_FMA_F16 i,i,k0.x,0 +// V_PERM_B32 r.x,i,i,k1 +// V_PERM_B32 r.y,i,i,k2 +//============================================================================================================================== + // Peak range for 32-bit and 16-bit operations. + #define A_BUC_32 (255.0) + #define A_BUC_16 (255.0/512.0) +//============================================================================================================================== + #if 1 + // Designed to be one V_CVT_PKACCUM_U8_F32. + // The extra min is required to pattern match to V_CVT_PKACCUM_U8_F32. 
+ AU1 ABuc0ToU1(AU1 d,AF1 i){return (d&0xffffff00u)|((min(AU1(i),255u) )&(0x000000ffu));} + AU1 ABuc1ToU1(AU1 d,AF1 i){return (d&0xffff00ffu)|((min(AU1(i),255u)<< 8)&(0x0000ff00u));} + AU1 ABuc2ToU1(AU1 d,AF1 i){return (d&0xff00ffffu)|((min(AU1(i),255u)<<16)&(0x00ff0000u));} + AU1 ABuc3ToU1(AU1 d,AF1 i){return (d&0x00ffffffu)|((min(AU1(i),255u)<<24)&(0xff000000u));} +//------------------------------------------------------------------------------------------------------------------------------ + // Designed to be one V_CVT_F32_UBYTE*. + AF1 ABuc0FromU1(AU1 i){return AF1((i )&255u);} + AF1 ABuc1FromU1(AU1 i){return AF1((i>> 8)&255u);} + AF1 ABuc2FromU1(AU1 i){return AF1((i>>16)&255u);} + AF1 ABuc3FromU1(AU1 i){return AF1((i>>24)&255u);} + #endif +//============================================================================================================================== + #ifdef A_HALF + // Takes {x0,x1} and {y0,y1} and builds {{x0,y0},{x1,y1}}. + AW2 ABuc01ToW2(AH2 x,AH2 y){x*=AH2_(1.0/32768.0);y*=AH2_(1.0/32768.0); + return AW2_AU1(APermGCEA(AU2(AU1_AW2(AW2_AH2(x)),AU1_AW2(AW2_AH2(y)))));} +//------------------------------------------------------------------------------------------------------------------------------ + // Designed for 3 ops to do SOA to AOS and conversion. 
+ AU2 ABuc0ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0))); + return AU2(APermHGFA(AU2(d.x,b)),APermHGFC(AU2(d.y,b)));} + AU2 ABuc1ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0))); + return AU2(APermHGAE(AU2(d.x,b)),APermHGCE(AU2(d.y,b)));} + AU2 ABuc2ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0))); + return AU2(APermHAFE(AU2(d.x,b)),APermHCFE(AU2(d.y,b)));} + AU2 ABuc3ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0))); + return AU2(APermAGFE(AU2(d.x,b)),APermCGFE(AU2(d.y,b)));} +//------------------------------------------------------------------------------------------------------------------------------ + // Designed for 2 ops to do both AOS to SOA, and conversion. + AH2 ABuc0FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)))*AH2_(32768.0);} + AH2 ABuc1FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)))*AH2_(32768.0);} + AH2 ABuc2FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)))*AH2_(32768.0);} + AH2 ABuc3FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)))*AH2_(32768.0);} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// [BSC] BYTE SIGNED CONVERSION +//------------------------------------------------------------------------------------------------------------------------------ +// Similar to [BUC]. +// Works on a range of {-/+ A_BSC_<32,16>}, for <32-bit, and 16-bit> respectively. 
+//------------------------------------------------------------------------------------------------------------------------------ +// ENCODING (without zero-based encoding) +// ======== +// 0 = unused (can be used to mean something else) +// 1 = lowest value +// 128 = exact zero center (zero based encoding +// 255 = highest value +//------------------------------------------------------------------------------------------------------------------------------ +// Zero-based [Zb] flips the MSB bit of the byte (making 128 "exact zero" actually zero). +// This is useful if there is a desire for cleared values to decode as zero. +//------------------------------------------------------------------------------------------------------------------------------ +// BYTE : FLOAT - ABsc{0,1,2,3}{To,From}U2() - Designed for 16-bit denormal tricks and V_PERM_B32. +// ==== ===== +// 0 : -127/512 (unused) +// 1 : -126/512 +// 2 : -125/512 +// ... +// 128 : 0 +// ... +// 255 : 127/512 +// : 1/4 (just outside the encoding range) +//============================================================================================================================== + // Peak range for 32-bit and 16-bit operations. 
+ #define A_BSC_32 (127.0) + #define A_BSC_16 (127.0/512.0) +//============================================================================================================================== + #if 1 + AU1 ABsc0ToU1(AU1 d,AF1 i){return (d&0xffffff00u)|((min(AU1(i+128.0),255u) )&(0x000000ffu));} + AU1 ABsc1ToU1(AU1 d,AF1 i){return (d&0xffff00ffu)|((min(AU1(i+128.0),255u)<< 8)&(0x0000ff00u));} + AU1 ABsc2ToU1(AU1 d,AF1 i){return (d&0xff00ffffu)|((min(AU1(i+128.0),255u)<<16)&(0x00ff0000u));} + AU1 ABsc3ToU1(AU1 d,AF1 i){return (d&0x00ffffffu)|((min(AU1(i+128.0),255u)<<24)&(0xff000000u));} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 ABsc0ToZbU1(AU1 d,AF1 i){return ((d&0xffffff00u)|((min(AU1(trunc(i)+128.0),255u) )&(0x000000ffu)))^0x00000080u;} + AU1 ABsc1ToZbU1(AU1 d,AF1 i){return ((d&0xffff00ffu)|((min(AU1(trunc(i)+128.0),255u)<< 8)&(0x0000ff00u)))^0x00008000u;} + AU1 ABsc2ToZbU1(AU1 d,AF1 i){return ((d&0xff00ffffu)|((min(AU1(trunc(i)+128.0),255u)<<16)&(0x00ff0000u)))^0x00800000u;} + AU1 ABsc3ToZbU1(AU1 d,AF1 i){return ((d&0x00ffffffu)|((min(AU1(trunc(i)+128.0),255u)<<24)&(0xff000000u)))^0x80000000u;} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 ABsc0FromU1(AU1 i){return AF1((i )&255u)-128.0;} + AF1 ABsc1FromU1(AU1 i){return AF1((i>> 8)&255u)-128.0;} + AF1 ABsc2FromU1(AU1 i){return AF1((i>>16)&255u)-128.0;} + AF1 ABsc3FromU1(AU1 i){return AF1((i>>24)&255u)-128.0;} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 ABsc0FromZbU1(AU1 i){return AF1(((i )&255u)^0x80u)-128.0;} + AF1 ABsc1FromZbU1(AU1 i){return AF1(((i>> 8)&255u)^0x80u)-128.0;} + AF1 ABsc2FromZbU1(AU1 i){return AF1(((i>>16)&255u)^0x80u)-128.0;} + AF1 ABsc3FromZbU1(AU1 i){return AF1(((i>>24)&255u)^0x80u)-128.0;} + #endif 
+//============================================================================================================================== + #ifdef A_HALF + // Takes {x0,x1} and {y0,y1} and builds {{x0,y0},{x1,y1}}. + AW2 ABsc01ToW2(AH2 x,AH2 y){x=x*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);y=y*AH2_(1.0/32768.0)+AH2_(0.25/32768.0); + return AW2_AU1(APermGCEA(AU2(AU1_AW2(AW2_AH2(x)),AU1_AW2(AW2_AH2(y)))));} +//------------------------------------------------------------------------------------------------------------------------------ + AU2 ABsc0ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0))); + return AU2(APermHGFA(AU2(d.x,b)),APermHGFC(AU2(d.y,b)));} + AU2 ABsc1ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0))); + return AU2(APermHGAE(AU2(d.x,b)),APermHGCE(AU2(d.y,b)));} + AU2 ABsc2ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0))); + return AU2(APermHAFE(AU2(d.x,b)),APermHCFE(AU2(d.y,b)));} + AU2 ABsc3ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0))); + return AU2(APermAGFE(AU2(d.x,b)),APermCGFE(AU2(d.y,b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AU2 ABsc0ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u; + return AU2(APermHGFA(AU2(d.x,b)),APermHGFC(AU2(d.y,b)));} + AU2 ABsc1ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u; + return AU2(APermHGAE(AU2(d.x,b)),APermHGCE(AU2(d.y,b)));} + AU2 ABsc2ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u; + return AU2(APermHAFE(AU2(d.x,b)),APermHCFE(AU2(d.y,b)));} + AU2 ABsc3ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u; + return AU2(APermAGFE(AU2(d.x,b)),APermCGFE(AU2(d.y,b)));} 
+//------------------------------------------------------------------------------------------------------------------------------ + AH2 ABsc0FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)))*AH2_(32768.0)-AH2_(0.25);} + AH2 ABsc1FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)))*AH2_(32768.0)-AH2_(0.25);} + AH2 ABsc2FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)))*AH2_(32768.0)-AH2_(0.25);} + AH2 ABsc3FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)))*AH2_(32768.0)-AH2_(0.25);} +//------------------------------------------------------------------------------------------------------------------------------ + AH2 ABsc0FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);} + AH2 ABsc1FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);} + AH2 ABsc2FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);} + AH2 ABsc3FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HALF APPROXIMATIONS +//------------------------------------------------------------------------------------------------------------------------------ +// These support only positive inputs. +// Did not see value yet in specialization for range. +// Using quick testing, ended up mostly getting the same "best" approximation for various ranges. 
+// With hardware that can co-execute transcendentals, the value in approximations could be less than expected. +// However from a latency perspective, if execution of a transcendental is 4 clk, with no packed support, -> 8 clk total. +// And co-execution would require a compiler interleaving a lot of independent work for packed usage. +//------------------------------------------------------------------------------------------------------------------------------ +// The one Newton Raphson iteration form of rsq() was skipped (requires 6 ops total). +// Same with sqrt(), as this could be x*rsq() (7 ops). +//============================================================================================================================== + #ifdef A_HALF + // Minimize squared error across full positive range, 2 ops. + // The 0x1de2 based approximation maps {0 to 1} input maps to < 1 output. + AH1 APrxLoSqrtH1(AH1 a){return AH1_AW1((AW1_AH1(a)>>AW1_(1))+AW1_(0x1de2));} + AH2 APrxLoSqrtH2(AH2 a){return AH2_AW2((AW2_AH2(a)>>AW2_(1))+AW2_(0x1de2));} + AH3 APrxLoSqrtH3(AH3 a){return AH3_AW3((AW3_AH3(a)>>AW3_(1))+AW3_(0x1de2));} + AH4 APrxLoSqrtH4(AH4 a){return AH4_AW4((AW4_AH4(a)>>AW4_(1))+AW4_(0x1de2));} +//------------------------------------------------------------------------------------------------------------------------------ + // Lower precision estimation, 1 op. + // Minimize squared error across {smallest normal to 16384.0}. + AH1 APrxLoRcpH1(AH1 a){return AH1_AW1(AW1_(0x7784)-AW1_AH1(a));} + AH2 APrxLoRcpH2(AH2 a){return AH2_AW2(AW2_(0x7784)-AW2_AH2(a));} + AH3 APrxLoRcpH3(AH3 a){return AH3_AW3(AW3_(0x7784)-AW3_AH3(a));} + AH4 APrxLoRcpH4(AH4 a){return AH4_AW4(AW4_(0x7784)-AW4_AH4(a));} +//------------------------------------------------------------------------------------------------------------------------------ + // Medium precision estimation, one Newton Raphson iteration, 3 ops. 
+ AH1 APrxMedRcpH1(AH1 a){AH1 b=AH1_AW1(AW1_(0x778d)-AW1_AH1(a));return b*(-b*a+AH1_(2.0));} + AH2 APrxMedRcpH2(AH2 a){AH2 b=AH2_AW2(AW2_(0x778d)-AW2_AH2(a));return b*(-b*a+AH2_(2.0));} + AH3 APrxMedRcpH3(AH3 a){AH3 b=AH3_AW3(AW3_(0x778d)-AW3_AH3(a));return b*(-b*a+AH3_(2.0));} + AH4 APrxMedRcpH4(AH4 a){AH4 b=AH4_AW4(AW4_(0x778d)-AW4_AH4(a));return b*(-b*a+AH4_(2.0));} +//------------------------------------------------------------------------------------------------------------------------------ + // Minimize squared error across {smallest normal to 16384.0}, 2 ops. + AH1 APrxLoRsqH1(AH1 a){return AH1_AW1(AW1_(0x59a3)-(AW1_AH1(a)>>AW1_(1)));} + AH2 APrxLoRsqH2(AH2 a){return AH2_AW2(AW2_(0x59a3)-(AW2_AH2(a)>>AW2_(1)));} + AH3 APrxLoRsqH3(AH3 a){return AH3_AW3(AW3_(0x59a3)-(AW3_AH3(a)>>AW3_(1)));} + AH4 APrxLoRsqH4(AH4 a){return AH4_AW4(AW4_(0x59a3)-(AW4_AH4(a)>>AW4_(1)));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// FLOAT APPROXIMATIONS +//------------------------------------------------------------------------------------------------------------------------------ +// Michal Drobot has an excellent presentation on these: "Low Level Optimizations For GCN", +// - Idea dates back to SGI, then to Quake 3, etc. 
+// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +// - sqrt(x)=rsqrt(x)*x +// - rcp(x)=rsqrt(x)*rsqrt(x) for positive x +// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +//------------------------------------------------------------------------------------------------------------------------------ +// These below are from perhaps less complete searching for optimal. +// Used FP16 normal range for testing with +4096 32-bit step size for sampling error. +// So these match up well with the half approximations. +//============================================================================================================================== + AF1 APrxLoSqrtF1(AF1 a){return AF1_AU1((AU1_AF1(a)>>AU1_(1))+AU1_(0x1fbc4639));} + AF1 APrxLoRcpF1(AF1 a){return AF1_AU1(AU1_(0x7ef07ebb)-AU1_AF1(a));} + AF1 APrxMedRcpF1(AF1 a){AF1 b=AF1_AU1(AU1_(0x7ef19fff)-AU1_AF1(a));return b*(-b*a+AF1_(2.0));} + AF1 APrxLoRsqF1(AF1 a){return AF1_AU1(AU1_(0x5f347d74)-(AU1_AF1(a)>>AU1_(1)));} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 APrxLoSqrtF2(AF2 a){return AF2_AU2((AU2_AF2(a)>>AU2_(1))+AU2_(0x1fbc4639));} + AF2 APrxLoRcpF2(AF2 a){return AF2_AU2(AU2_(0x7ef07ebb)-AU2_AF2(a));} + AF2 APrxMedRcpF2(AF2 a){AF2 b=AF2_AU2(AU2_(0x7ef19fff)-AU2_AF2(a));return b*(-b*a+AF2_(2.0));} + AF2 APrxLoRsqF2(AF2 a){return AF2_AU2(AU2_(0x5f347d74)-(AU2_AF2(a)>>AU2_(1)));} +//------------------------------------------------------------------------------------------------------------------------------ + AF3 APrxLoSqrtF3(AF3 a){return AF3_AU3((AU3_AF3(a)>>AU3_(1))+AU3_(0x1fbc4639));} + AF3 APrxLoRcpF3(AF3 a){return AF3_AU3(AU3_(0x7ef07ebb)-AU3_AF3(a));} + AF3 APrxMedRcpF3(AF3 a){AF3 b=AF3_AU3(AU3_(0x7ef19fff)-AU3_AF3(a));return b*(-b*a+AF3_(2.0));} + AF3 APrxLoRsqF3(AF3 a){return AF3_AU3(AU3_(0x5f347d74)-(AU3_AF3(a)>>AU3_(1)));} 
+//------------------------------------------------------------------------------------------------------------------------------ + AF4 APrxLoSqrtF4(AF4 a){return AF4_AU4((AU4_AF4(a)>>AU4_(1))+AU4_(0x1fbc4639));} + AF4 APrxLoRcpF4(AF4 a){return AF4_AU4(AU4_(0x7ef07ebb)-AU4_AF4(a));} + AF4 APrxMedRcpF4(AF4 a){AF4 b=AF4_AU4(AU4_(0x7ef19fff)-AU4_AF4(a));return b*(-b*a+AF4_(2.0));} + AF4 APrxLoRsqF4(AF4 a){return AF4_AU4(AU4_(0x5f347d74)-(AU4_AF4(a)>>AU4_(1)));} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// PQ APPROXIMATIONS +//------------------------------------------------------------------------------------------------------------------------------ +// PQ is very close to x^(1/8). The functions below Use the fast float approximation method to do +// PQ<~>Gamma2 (4th power and fast 4th root) and PQ<~>Linear (8th power and fast 8th root). Maximum error is ~0.2%. 
+//============================================================================================================================== +// Helpers + AF1 Quart(AF1 a) { a = a * a; return a * a;} + AF1 Oct(AF1 a) { a = a * a; a = a * a; return a * a; } + AF2 Quart(AF2 a) { a = a * a; return a * a; } + AF2 Oct(AF2 a) { a = a * a; a = a * a; return a * a; } + AF3 Quart(AF3 a) { a = a * a; return a * a; } + AF3 Oct(AF3 a) { a = a * a; a = a * a; return a * a; } + AF4 Quart(AF4 a) { a = a * a; return a * a; } + AF4 Oct(AF4 a) { a = a * a; a = a * a; return a * a; } + //------------------------------------------------------------------------------------------------------------------------------ + AF1 APrxPQToGamma2(AF1 a) { return Quart(a); } + AF1 APrxPQToLinear(AF1 a) { return Oct(a); } + AF1 APrxLoGamma2ToPQ(AF1 a) { return AF1_AU1((AU1_AF1(a) >> AU1_(2)) + AU1_(0x2F9A4E46)); } + AF1 APrxMedGamma2ToPQ(AF1 a) { AF1 b = AF1_AU1((AU1_AF1(a) >> AU1_(2)) + AU1_(0x2F9A4E46)); AF1 b4 = Quart(b); return b - b * (b4 - a) / (AF1_(4.0) * b4); } + AF1 APrxHighGamma2ToPQ(AF1 a) { return sqrt(sqrt(a)); } + AF1 APrxLoLinearToPQ(AF1 a) { return AF1_AU1((AU1_AF1(a) >> AU1_(3)) + AU1_(0x378D8723)); } + AF1 APrxMedLinearToPQ(AF1 a) { AF1 b = AF1_AU1((AU1_AF1(a) >> AU1_(3)) + AU1_(0x378D8723)); AF1 b8 = Oct(b); return b - b * (b8 - a) / (AF1_(8.0) * b8); } + AF1 APrxHighLinearToPQ(AF1 a) { return sqrt(sqrt(sqrt(a))); } + //------------------------------------------------------------------------------------------------------------------------------ + AF2 APrxPQToGamma2(AF2 a) { return Quart(a); } + AF2 APrxPQToLinear(AF2 a) { return Oct(a); } + AF2 APrxLoGamma2ToPQ(AF2 a) { return AF2_AU2((AU2_AF2(a) >> AU2_(2)) + AU2_(0x2F9A4E46)); } + AF2 APrxMedGamma2ToPQ(AF2 a) { AF2 b = AF2_AU2((AU2_AF2(a) >> AU2_(2)) + AU2_(0x2F9A4E46)); AF2 b4 = Quart(b); return b - b * (b4 - a) / (AF1_(4.0) * b4); } + AF2 APrxHighGamma2ToPQ(AF2 a) { return sqrt(sqrt(a)); } + AF2 APrxLoLinearToPQ(AF2 a) { return 
AF2_AU2((AU2_AF2(a) >> AU2_(3)) + AU2_(0x378D8723)); } + AF2 APrxMedLinearToPQ(AF2 a) { AF2 b = AF2_AU2((AU2_AF2(a) >> AU2_(3)) + AU2_(0x378D8723)); AF2 b8 = Oct(b); return b - b * (b8 - a) / (AF1_(8.0) * b8); } + AF2 APrxHighLinearToPQ(AF2 a) { return sqrt(sqrt(sqrt(a))); } + //------------------------------------------------------------------------------------------------------------------------------ + AF3 APrxPQToGamma2(AF3 a) { return Quart(a); } + AF3 APrxPQToLinear(AF3 a) { return Oct(a); } + AF3 APrxLoGamma2ToPQ(AF3 a) { return AF3_AU3((AU3_AF3(a) >> AU3_(2)) + AU3_(0x2F9A4E46)); } + AF3 APrxMedGamma2ToPQ(AF3 a) { AF3 b = AF3_AU3((AU3_AF3(a) >> AU3_(2)) + AU3_(0x2F9A4E46)); AF3 b4 = Quart(b); return b - b * (b4 - a) / (AF1_(4.0) * b4); } + AF3 APrxHighGamma2ToPQ(AF3 a) { return sqrt(sqrt(a)); } + AF3 APrxLoLinearToPQ(AF3 a) { return AF3_AU3((AU3_AF3(a) >> AU3_(3)) + AU3_(0x378D8723)); } + AF3 APrxMedLinearToPQ(AF3 a) { AF3 b = AF3_AU3((AU3_AF3(a) >> AU3_(3)) + AU3_(0x378D8723)); AF3 b8 = Oct(b); return b - b * (b8 - a) / (AF1_(8.0) * b8); } + AF3 APrxHighLinearToPQ(AF3 a) { return sqrt(sqrt(sqrt(a))); } + //------------------------------------------------------------------------------------------------------------------------------ + AF4 APrxPQToGamma2(AF4 a) { return Quart(a); } + AF4 APrxPQToLinear(AF4 a) { return Oct(a); } + AF4 APrxLoGamma2ToPQ(AF4 a) { return AF4_AU4((AU4_AF4(a) >> AU4_(2)) + AU4_(0x2F9A4E46)); } + AF4 APrxMedGamma2ToPQ(AF4 a) { AF4 b = AF4_AU4((AU4_AF4(a) >> AU4_(2)) + AU4_(0x2F9A4E46)); AF4 b4 = Quart(b); return b - b * (b4 - a) / (AF1_(4.0) * b4); } + AF4 APrxHighGamma2ToPQ(AF4 a) { return sqrt(sqrt(a)); } + AF4 APrxLoLinearToPQ(AF4 a) { return AF4_AU4((AU4_AF4(a) >> AU4_(3)) + AU4_(0x378D8723)); } + AF4 APrxMedLinearToPQ(AF4 a) { AF4 b = AF4_AU4((AU4_AF4(a) >> AU4_(3)) + AU4_(0x378D8723)); AF4 b8 = Oct(b); return b - b * (b8 - a) / (AF1_(8.0) * b8); } + AF4 APrxHighLinearToPQ(AF4 a) { return sqrt(sqrt(sqrt(a))); } 
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// PARABOLIC SIN & COS +//------------------------------------------------------------------------------------------------------------------------------ +// Approximate answers to transcendental questions. +//------------------------------------------------------------------------------------------------------------------------------ +//============================================================================================================================== + #if 1 + // Valid input range is {-1 to 1} representing {0 to 2 pi}. + // Output range is {-1/4 to 1/4} representing {-1 to 1}. + AF1 APSinF1(AF1 x){return x*abs(x)-x;} // MAD. + AF2 APSinF2(AF2 x){return x*abs(x)-x;} + AF1 APCosF1(AF1 x){x=AFractF1(x*AF1_(0.5)+AF1_(0.75));x=x*AF1_(2.0)-AF1_(1.0);return APSinF1(x);} // 3x MAD, FRACT + AF2 APCosF2(AF2 x){x=AFractF2(x*AF2_(0.5)+AF2_(0.75));x=x*AF2_(2.0)-AF2_(1.0);return APSinF2(x);} + AF2 APSinCosF1(AF1 x){AF1 y=AFractF1(x*AF1_(0.5)+AF1_(0.75));y=y*AF1_(2.0)-AF1_(1.0);return APSinF2(AF2(x,y));} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_HALF + // For a packed {sin,cos} pair, + // - Native takes 16 clocks and 4 issue slots (no packed transcendentals). + // - Parabolic takes 8 clocks and 8 issue slots (only fract is non-packed). 
+ AH1 APSinH1(AH1 x){return x*abs(x)-x;} + AH2 APSinH2(AH2 x){return x*abs(x)-x;} // AND,FMA + AH1 APCosH1(AH1 x){x=AFractH1(x*AH1_(0.5)+AH1_(0.75));x=x*AH1_(2.0)-AH1_(1.0);return APSinH1(x);} + AH2 APCosH2(AH2 x){x=AFractH2(x*AH2_(0.5)+AH2_(0.75));x=x*AH2_(2.0)-AH2_(1.0);return APSinH2(x);} // 3x FMA, 2xFRACT, AND + AH2 APSinCosH1(AH1 x){AH1 y=AFractH1(x*AH1_(0.5)+AH1_(0.75));y=y*AH1_(2.0)-AH1_(1.0);return APSinH2(AH2(x,y));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// [ZOL] ZERO ONE LOGIC +//------------------------------------------------------------------------------------------------------------------------------ +// Conditional free logic designed for easy 16-bit packing, and backwards porting to 32-bit. +//------------------------------------------------------------------------------------------------------------------------------ +// 0 := false +// 1 := true +//------------------------------------------------------------------------------------------------------------------------------ +// AndNot(x,y) -> !(x&y) .... One op. +// AndOr(x,y,z) -> (x&y)|z ... One op. +// GtZero(x) -> x>0.0 ..... One op. +// Sel(x,y,z) -> x?y:z ..... Two ops, has no precision loss. +// Signed(x) -> x<0.0 ..... One op. +// ZeroPass(x,y) -> x?0:y ..... Two ops, 'y' is a pass through safe for aliasing as integer. 
+//------------------------------------------------------------------------------------------------------------------------------ +// OPTIMIZATION NOTES +// ================== +// - On Vega to use 2 constants in a packed op, pass in as one AW2 or one AH2 'k.xy' and use as 'k.xx' and 'k.yy'. +// For example 'a.xy*k.xx+k.yy'. +//============================================================================================================================== + #if 1 + AU1 AZolAndU1(AU1 x,AU1 y){return min(x,y);} + AU2 AZolAndU2(AU2 x,AU2 y){return min(x,y);} + AU3 AZolAndU3(AU3 x,AU3 y){return min(x,y);} + AU4 AZolAndU4(AU4 x,AU4 y){return min(x,y);} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AZolNotU1(AU1 x){return x^AU1_(1);} + AU2 AZolNotU2(AU2 x){return x^AU2_(1);} + AU3 AZolNotU3(AU3 x){return x^AU3_(1);} + AU4 AZolNotU4(AU4 x){return x^AU4_(1);} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AZolOrU1(AU1 x,AU1 y){return max(x,y);} + AU2 AZolOrU2(AU2 x,AU2 y){return max(x,y);} + AU3 AZolOrU3(AU3 x,AU3 y){return max(x,y);} + AU4 AZolOrU4(AU4 x,AU4 y){return max(x,y);} +//============================================================================================================================== + AU1 AZolF1ToU1(AF1 x){return AU1(x);} + AU2 AZolF2ToU2(AF2 x){return AU2(x);} + AU3 AZolF3ToU3(AF3 x){return AU3(x);} + AU4 AZolF4ToU4(AF4 x){return AU4(x);} +//------------------------------------------------------------------------------------------------------------------------------ + // 2 ops, denormals don't work in 32-bit on PC (and if they are enabled, OMOD is disabled). 
+ AU1 AZolNotF1ToU1(AF1 x){return AU1(AF1_(1.0)-x);} + AU2 AZolNotF2ToU2(AF2 x){return AU2(AF2_(1.0)-x);} + AU3 AZolNotF3ToU3(AF3 x){return AU3(AF3_(1.0)-x);} + AU4 AZolNotF4ToU4(AF4 x){return AU4(AF4_(1.0)-x);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AZolU1ToF1(AU1 x){return AF1(x);} + AF2 AZolU2ToF2(AU2 x){return AF2(x);} + AF3 AZolU3ToF3(AU3 x){return AF3(x);} + AF4 AZolU4ToF4(AU4 x){return AF4(x);} +//============================================================================================================================== + AF1 AZolAndF1(AF1 x,AF1 y){return min(x,y);} + AF2 AZolAndF2(AF2 x,AF2 y){return min(x,y);} + AF3 AZolAndF3(AF3 x,AF3 y){return min(x,y);} + AF4 AZolAndF4(AF4 x,AF4 y){return min(x,y);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 ASolAndNotF1(AF1 x,AF1 y){return (-x)*y+AF1_(1.0);} + AF2 ASolAndNotF2(AF2 x,AF2 y){return (-x)*y+AF2_(1.0);} + AF3 ASolAndNotF3(AF3 x,AF3 y){return (-x)*y+AF3_(1.0);} + AF4 ASolAndNotF4(AF4 x,AF4 y){return (-x)*y+AF4_(1.0);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AZolAndOrF1(AF1 x,AF1 y,AF1 z){return ASatF1(x*y+z);} + AF2 AZolAndOrF2(AF2 x,AF2 y,AF2 z){return ASatF2(x*y+z);} + AF3 AZolAndOrF3(AF3 x,AF3 y,AF3 z){return ASatF3(x*y+z);} + AF4 AZolAndOrF4(AF4 x,AF4 y,AF4 z){return ASatF4(x*y+z);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AZolGtZeroF1(AF1 x){return ASatF1(x*AF1_(A_INFP_F));} + AF2 AZolGtZeroF2(AF2 x){return ASatF2(x*AF2_(A_INFP_F));} + AF3 AZolGtZeroF3(AF3 x){return ASatF3(x*AF3_(A_INFP_F));} + AF4 AZolGtZeroF4(AF4 x){return ASatF4(x*AF4_(A_INFP_F));} 
+//------------------------------------------------------------------------------------------------------------------------------ + AF1 AZolNotF1(AF1 x){return AF1_(1.0)-x;} + AF2 AZolNotF2(AF2 x){return AF2_(1.0)-x;} + AF3 AZolNotF3(AF3 x){return AF3_(1.0)-x;} + AF4 AZolNotF4(AF4 x){return AF4_(1.0)-x;} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AZolOrF1(AF1 x,AF1 y){return max(x,y);} + AF2 AZolOrF2(AF2 x,AF2 y){return max(x,y);} + AF3 AZolOrF3(AF3 x,AF3 y){return max(x,y);} + AF4 AZolOrF4(AF4 x,AF4 y){return max(x,y);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AZolSelF1(AF1 x,AF1 y,AF1 z){AF1 r=(-x)*z+z;return x*y+r;} + AF2 AZolSelF2(AF2 x,AF2 y,AF2 z){AF2 r=(-x)*z+z;return x*y+r;} + AF3 AZolSelF3(AF3 x,AF3 y,AF3 z){AF3 r=(-x)*z+z;return x*y+r;} + AF4 AZolSelF4(AF4 x,AF4 y,AF4 z){AF4 r=(-x)*z+z;return x*y+r;} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AZolSignedF1(AF1 x){return ASatF1(x*AF1_(A_INFN_F));} + AF2 AZolSignedF2(AF2 x){return ASatF2(x*AF2_(A_INFN_F));} + AF3 AZolSignedF3(AF3 x){return ASatF3(x*AF3_(A_INFN_F));} + AF4 AZolSignedF4(AF4 x){return ASatF4(x*AF4_(A_INFN_F));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AZolZeroPassF1(AF1 x,AF1 y){return AF1_AU1((AU1_AF1(x)!=AU1_(0))?AU1_(0):AU1_AF1(y));} + AF2 AZolZeroPassF2(AF2 x,AF2 y){return AF2_AU2((AU2_AF2(x)!=AU2_(0))?AU2_(0):AU2_AF2(y));} + AF3 AZolZeroPassF3(AF3 x,AF3 y){return AF3_AU3((AU3_AF3(x)!=AU3_(0))?AU3_(0):AU3_AF3(y));} + AF4 AZolZeroPassF4(AF4 x,AF4 y){return AF4_AU4((AU4_AF4(x)!=AU4_(0))?AU4_(0):AU4_AF4(y));} + #endif 
+//============================================================================================================================== + #ifdef A_HALF + AW1 AZolAndW1(AW1 x,AW1 y){return min(x,y);} + AW2 AZolAndW2(AW2 x,AW2 y){return min(x,y);} + AW3 AZolAndW3(AW3 x,AW3 y){return min(x,y);} + AW4 AZolAndW4(AW4 x,AW4 y){return min(x,y);} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AZolNotW1(AW1 x){return x^AW1_(1);} + AW2 AZolNotW2(AW2 x){return x^AW2_(1);} + AW3 AZolNotW3(AW3 x){return x^AW3_(1);} + AW4 AZolNotW4(AW4 x){return x^AW4_(1);} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AZolOrW1(AW1 x,AW1 y){return max(x,y);} + AW2 AZolOrW2(AW2 x,AW2 y){return max(x,y);} + AW3 AZolOrW3(AW3 x,AW3 y){return max(x,y);} + AW4 AZolOrW4(AW4 x,AW4 y){return max(x,y);} +//============================================================================================================================== + // Uses denormal trick. + AW1 AZolH1ToW1(AH1 x){return AW1_AH1(x*AH1_AW1(AW1_(1)));} + AW2 AZolH2ToW2(AH2 x){return AW2_AH2(x*AH2_AW2(AW2_(1)));} + AW3 AZolH3ToW3(AH3 x){return AW3_AH3(x*AH3_AW3(AW3_(1)));} + AW4 AZolH4ToW4(AH4 x){return AW4_AH4(x*AH4_AW4(AW4_(1)));} +//------------------------------------------------------------------------------------------------------------------------------ + // AMD arch lacks a packed conversion opcode. 
+ AH1 AZolW1ToH1(AW1 x){return AH1_AW1(x*AW1_AH1(AH1_(1.0)));} + AH2 AZolW2ToH2(AW2 x){return AH2_AW2(x*AW2_AH2(AH2_(1.0)));} + AH3 AZolW1ToH3(AW3 x){return AH3_AW3(x*AW3_AH3(AH3_(1.0)));} + AH4 AZolW2ToH4(AW4 x){return AH4_AW4(x*AW4_AH4(AH4_(1.0)));} +//============================================================================================================================== + AH1 AZolAndH1(AH1 x,AH1 y){return min(x,y);} + AH2 AZolAndH2(AH2 x,AH2 y){return min(x,y);} + AH3 AZolAndH3(AH3 x,AH3 y){return min(x,y);} + AH4 AZolAndH4(AH4 x,AH4 y){return min(x,y);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ASolAndNotH1(AH1 x,AH1 y){return (-x)*y+AH1_(1.0);} + AH2 ASolAndNotH2(AH2 x,AH2 y){return (-x)*y+AH2_(1.0);} + AH3 ASolAndNotH3(AH3 x,AH3 y){return (-x)*y+AH3_(1.0);} + AH4 ASolAndNotH4(AH4 x,AH4 y){return (-x)*y+AH4_(1.0);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AZolAndOrH1(AH1 x,AH1 y,AH1 z){return ASatH1(x*y+z);} + AH2 AZolAndOrH2(AH2 x,AH2 y,AH2 z){return ASatH2(x*y+z);} + AH3 AZolAndOrH3(AH3 x,AH3 y,AH3 z){return ASatH3(x*y+z);} + AH4 AZolAndOrH4(AH4 x,AH4 y,AH4 z){return ASatH4(x*y+z);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AZolGtZeroH1(AH1 x){return ASatH1(x*AH1_(A_INFP_H));} + AH2 AZolGtZeroH2(AH2 x){return ASatH2(x*AH2_(A_INFP_H));} + AH3 AZolGtZeroH3(AH3 x){return ASatH3(x*AH3_(A_INFP_H));} + AH4 AZolGtZeroH4(AH4 x){return ASatH4(x*AH4_(A_INFP_H));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AZolNotH1(AH1 x){return AH1_(1.0)-x;} + AH2 AZolNotH2(AH2 x){return AH2_(1.0)-x;} + AH3 AZolNotH3(AH3 x){return AH3_(1.0)-x;} + AH4 AZolNotH4(AH4 x){return AH4_(1.0)-x;} 
+//------------------------------------------------------------------------------------------------------------------------------ + AH1 AZolOrH1(AH1 x,AH1 y){return max(x,y);} + AH2 AZolOrH2(AH2 x,AH2 y){return max(x,y);} + AH3 AZolOrH3(AH3 x,AH3 y){return max(x,y);} + AH4 AZolOrH4(AH4 x,AH4 y){return max(x,y);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AZolSelH1(AH1 x,AH1 y,AH1 z){AH1 r=(-x)*z+z;return x*y+r;} + AH2 AZolSelH2(AH2 x,AH2 y,AH2 z){AH2 r=(-x)*z+z;return x*y+r;} + AH3 AZolSelH3(AH3 x,AH3 y,AH3 z){AH3 r=(-x)*z+z;return x*y+r;} + AH4 AZolSelH4(AH4 x,AH4 y,AH4 z){AH4 r=(-x)*z+z;return x*y+r;} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AZolSignedH1(AH1 x){return ASatH1(x*AH1_(A_INFN_H));} + AH2 AZolSignedH2(AH2 x){return ASatH2(x*AH2_(A_INFN_H));} + AH3 AZolSignedH3(AH3 x){return ASatH3(x*AH3_(A_INFN_H));} + AH4 AZolSignedH4(AH4 x){return ASatH4(x*AH4_(A_INFN_H));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// COLOR CONVERSIONS +//------------------------------------------------------------------------------------------------------------------------------ +// These are all linear to/from some other space (where 'linear' has been shortened out of the function name). +// So 'ToGamma' is 'LinearToGamma', and 'FromGamma' is 'LinearFromGamma'. +// These are branch free implementations. 
+// The AToSrgbF1() function is useful for stores for compute shaders for GPUs without hardware linear->sRGB store conversion.
+//------------------------------------------------------------------------------------------------------------------------------
+// TRANSFER FUNCTIONS
+// ==================
+// 709 ..... Rec709 used for some HDTVs
+// Gamma ... Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native
+// Pq ...... PQ native for HDR10
+// Srgb .... The sRGB output, typical of PC displays, useful for 10-bit output, or storing to 8-bit UNORM without SRGB type
+// Two ..... Gamma 2.0, fastest conversion (useful for intermediate pass approximations)
+// Three ... Gamma 3.0, less fast, but good for HDR.
+//------------------------------------------------------------------------------------------------------------------------------
+// KEEPING TO SPEC
+// ===============
+// Both Rec.709 and sRGB have a linear segment which as spec'ed would intersect the curved segment 2 times.
+// (a.) For 8-bit sRGB, steps {0 to 10.3} are in the linear region (4% of the encoding range).
+// (b.) For 8-bit 709, steps {0 to 20.7} are in the linear region (8% of the encoding range).
+// Also there is a slight step in the transition regions.
+// Precision of the coefficients in the spec being the likely cause.
+// Main usage case of the sRGB code is to do the linear->sRGB conversion in a compute shader before store.
+// This is to work around lack of hardware (typically only ROP does the conversion for free).
+// To "correct" the linear segment, would be to introduce error, because hardware decode of sRGB->linear is fixed (and free).
+// So this header keeps with the spec.
+// For linear->sRGB transforms, the linear segment in some respects reduces error, because rounding in that region is linear.
+// Rounding in the curved region in hardware (and fast software code) introduces error due to rounding in non-linear.
+//------------------------------------------------------------------------------------------------------------------------------ +// FOR PQ +// ====== +// Both input and output is {0.0-1.0}, and where output 1.0 represents 10000.0 cd/m^2. +// All constants are only specified to FP32 precision. +// External PQ source reference, +// - https://github.com/ampas/aces-dev/blob/master/transforms/ctl/utilities/ACESlib.Utilities_Color.a1.0.1.ctl +//------------------------------------------------------------------------------------------------------------------------------ +// PACKED VERSIONS +// =============== +// These are the A*H2() functions. +// There is no PQ functions as FP16 seemed to not have enough precision for the conversion. +// The remaining functions are "good enough" for 8-bit, and maybe 10-bit if not concerned about a few 1-bit errors. +// Precision is lowest in the 709 conversion, higher in sRGB, higher still in Two and Gamma (when using 2.2 at least). +//------------------------------------------------------------------------------------------------------------------------------ +// NOTES +// ===== +// Could be faster for PQ conversions to be in ALU or a texture lookup depending on usage case. +//============================================================================================================================== + #if 1 + AF1 ATo709F1(AF1 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099); + return clamp(j.x ,c*j.y ,pow(c,j.z )*k.x +k.y );} + AF2 ATo709F2(AF2 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099); + return clamp(j.xx ,c*j.yy ,pow(c,j.zz )*k.xx +k.yy );} + AF3 ATo709F3(AF3 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099); + return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);} +//------------------------------------------------------------------------------------------------------------------------------ + // Note 'rcpX' is '1/x', where the 'x' is what would be used in AFromGamma(). 
+ AF1 AToGammaF1(AF1 c,AF1 rcpX){return pow(c,AF1_(rcpX));} + AF2 AToGammaF2(AF2 c,AF1 rcpX){return pow(c,AF2_(rcpX));} + AF3 AToGammaF3(AF3 c,AF1 rcpX){return pow(c,AF3_(rcpX));} +//------------------------------------------------------------------------------------------------------------------------------ + // Linear -> PQ (SMPTE ST 2084) encode; input/output {0 to 1}, 1.0 = 10000 cd/m^2. + // Note: upstream named the AF2/AF3 variants 'AToPqF1' (copy-paste); renamed to follow the F2/F3 + // suffix convention used everywhere else in this header, with the old names kept as aliases. + AF1 AToPqF1(AF1 x){AF1 p=pow(x,AF1_(0.159302)); + return pow((AF1_(0.835938)+AF1_(18.8516)*p)/(AF1_(1.0)+AF1_(18.6875)*p),AF1_(78.8438));} + AF2 AToPqF2(AF2 x){AF2 p=pow(x,AF2_(0.159302)); + return pow((AF2_(0.835938)+AF2_(18.8516)*p)/(AF2_(1.0)+AF2_(18.6875)*p),AF2_(78.8438));} + AF3 AToPqF3(AF3 x){AF3 p=pow(x,AF3_(0.159302)); + return pow((AF3_(0.835938)+AF3_(18.8516)*p)/(AF3_(1.0)+AF3_(18.6875)*p),AF3_(78.8438));} + // Deprecated upstream-compatible aliases (GLSL overload on argument type). + AF2 AToPqF1(AF2 x){return AToPqF2(x);} + AF3 AToPqF1(AF3 x){return AToPqF3(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AToSrgbF1(AF1 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055); + return clamp(j.x ,c*j.y ,pow(c,j.z )*k.x +k.y );} + AF2 AToSrgbF2(AF2 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055); + return clamp(j.xx ,c*j.yy ,pow(c,j.zz )*k.xx +k.yy );} + AF3 AToSrgbF3(AF3 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055); + return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AToTwoF1(AF1 c){return sqrt(c);} + AF2 AToTwoF2(AF2 c){return sqrt(c);} + AF3 AToTwoF3(AF3 c){return sqrt(c);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AToThreeF1(AF1 c){return pow(c,AF1_(1.0/3.0));} + AF2 AToThreeF2(AF2 c){return pow(c,AF2_(1.0/3.0));} + AF3 AToThreeF3(AF3 c){return pow(c,AF3_(1.0/3.0));} + #endif +//============================================================================================================================== + #if 1 + //
Unfortunately median won't work here. + AF1 AFrom709F1(AF1 c){AF3 j=AF3(0.081/4.5,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099); + return AZolSelF1(AZolSignedF1(c-j.x ),c*j.y ,pow(c*k.x +k.y ,j.z ));} + AF2 AFrom709F2(AF2 c){AF3 j=AF3(0.081/4.5,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099); + return AZolSelF2(AZolSignedF2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));} + AF3 AFrom709F3(AF3 c){AF3 j=AF3(0.081/4.5,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099); + return AZolSelF3(AZolSignedF3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AFromGammaF1(AF1 c,AF1 x){return pow(c,AF1_(x));} + AF2 AFromGammaF2(AF2 c,AF1 x){return pow(c,AF2_(x));} + AF3 AFromGammaF3(AF3 c,AF1 x){return pow(c,AF3_(x));} +//------------------------------------------------------------------------------------------------------------------------------ + // PQ (SMPTE ST 2084) -> linear decode, inverse of AToPq*. + // Note: upstream named the AF2/AF3 variants 'AFromPqF1' (copy-paste); renamed to follow the F2/F3 + // suffix convention, with the old names kept as aliases. + AF1 AFromPqF1(AF1 x){AF1 p=pow(x,AF1_(0.0126833)); + return pow(ASatF1(p-AF1_(0.835938))/(AF1_(18.8516)-AF1_(18.6875)*p),AF1_(6.27739));} + AF2 AFromPqF2(AF2 x){AF2 p=pow(x,AF2_(0.0126833)); + return pow(ASatF2(p-AF2_(0.835938))/(AF2_(18.8516)-AF2_(18.6875)*p),AF2_(6.27739));} + AF3 AFromPqF3(AF3 x){AF3 p=pow(x,AF3_(0.0126833)); + return pow(ASatF3(p-AF3_(0.835938))/(AF3_(18.8516)-AF3_(18.6875)*p),AF3_(6.27739));} + // Deprecated upstream-compatible aliases (GLSL overload on argument type). + AF2 AFromPqF1(AF2 x){return AFromPqF2(x);} + AF3 AFromPqF1(AF3 x){return AFromPqF3(x);} +//------------------------------------------------------------------------------------------------------------------------------ + // Unfortunately median won't work here.
+ AF1 AFromSrgbF1(AF1 c){AF3 j=AF3(0.04045/12.92,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055); + return AZolSelF1(AZolSignedF1(c-j.x ),c*j.y ,pow(c*k.x +k.y ,j.z ));} + AF2 AFromSrgbF2(AF2 c){AF3 j=AF3(0.04045/12.92,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055); + return AZolSelF2(AZolSignedF2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));} + AF3 AFromSrgbF3(AF3 c){AF3 j=AF3(0.04045/12.92,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055); + return AZolSelF3(AZolSignedF3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AFromTwoF1(AF1 c){return c*c;} + AF2 AFromTwoF2(AF2 c){return c*c;} + AF3 AFromTwoF3(AF3 c){return c*c;} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AFromThreeF1(AF1 c){return c*c*c;} + AF2 AFromThreeF2(AF2 c){return c*c*c;} + AF3 AFromThreeF3(AF3 c){return c*c*c;} + #endif +//============================================================================================================================== + #ifdef A_HALF + AH1 ATo709H1(AH1 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099); + return clamp(j.x ,c*j.y ,pow(c,j.z )*k.x +k.y );} + AH2 ATo709H2(AH2 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099); + return clamp(j.xx ,c*j.yy ,pow(c,j.zz )*k.xx +k.yy );} + AH3 ATo709H3(AH3 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099); + return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AToGammaH1(AH1 c,AH1 rcpX){return pow(c,AH1_(rcpX));} + AH2 AToGammaH2(AH2 c,AH1 rcpX){return pow(c,AH2_(rcpX));} + AH3 AToGammaH3(AH3 c,AH1 rcpX){return pow(c,AH3_(rcpX));} 
+//------------------------------------------------------------------------------------------------------------------------------ + AH1 AToSrgbH1(AH1 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055); + return clamp(j.x ,c*j.y ,pow(c,j.z )*k.x +k.y );} + AH2 AToSrgbH2(AH2 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055); + return clamp(j.xx ,c*j.yy ,pow(c,j.zz )*k.xx +k.yy );} + AH3 AToSrgbH3(AH3 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055); + return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AToTwoH1(AH1 c){return sqrt(c);} + AH2 AToTwoH2(AH2 c){return sqrt(c);} + AH3 AToTwoH3(AH3 c){return sqrt(c);} +//------------------------------------------------------------------------------------------------------------------------------ + // Gamma 3.0 encode, half precision. Note: upstream misnamed these 'AToThreeF1/F2/F3' inside the + // A_HALF section; renamed with the H suffix to match every other half variant, old names kept as aliases. + AH1 AToThreeH1(AH1 c){return pow(c,AH1_(1.0/3.0));} + AH2 AToThreeH2(AH2 c){return pow(c,AH2_(1.0/3.0));} + AH3 AToThreeH3(AH3 c){return pow(c,AH3_(1.0/3.0));} + // Deprecated upstream-compatible aliases. + AH1 AToThreeF1(AH1 c){return AToThreeH1(c);} + AH2 AToThreeF2(AH2 c){return AToThreeH2(c);} + AH3 AToThreeF3(AH3 c){return AToThreeH3(c);} + #endif +//============================================================================================================================== + #ifdef A_HALF + AH1 AFrom709H1(AH1 c){AH3 j=AH3(0.081/4.5,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099); + return AZolSelH1(AZolSignedH1(c-j.x ),c*j.y ,pow(c*k.x +k.y ,j.z ));} + AH2 AFrom709H2(AH2 c){AH3 j=AH3(0.081/4.5,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099); + return AZolSelH2(AZolSignedH2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));} + AH3 AFrom709H3(AH3 c){AH3 j=AH3(0.081/4.5,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099); + return AZolSelH3(AZolSignedH3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AFromGammaH1(AH1 c,AH1 x){return pow(c,AH1_(x));} + AH2 AFromGammaH2(AH2 c,AH1 x){return
pow(c,AH2_(x));} + AH3 AFromGammaH3(AH3 c,AH1 x){return pow(c,AH3_(x));} +//------------------------------------------------------------------------------------------------------------------------------ + // sRGB -> linear decode, half precision. Note: upstream double-typo'd these as 'AHromSrgbF1/F2/F3' + // ('AHrom' for 'AFrom', F suffix for H), so the conventional AFromSrgbH* entry points did not exist; + // renamed accordingly, old names kept as aliases. + AH1 AFromSrgbH1(AH1 c){AH3 j=AH3(0.04045/12.92,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055); + return AZolSelH1(AZolSignedH1(c-j.x ),c*j.y ,pow(c*k.x +k.y ,j.z ));} + AH2 AFromSrgbH2(AH2 c){AH3 j=AH3(0.04045/12.92,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055); + return AZolSelH2(AZolSignedH2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));} + AH3 AFromSrgbH3(AH3 c){AH3 j=AH3(0.04045/12.92,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055); + return AZolSelH3(AZolSignedH3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));} + // Deprecated upstream-compatible aliases. + AH1 AHromSrgbF1(AH1 c){return AFromSrgbH1(c);} + AH2 AHromSrgbF2(AH2 c){return AFromSrgbH2(c);} + AH3 AHromSrgbF3(AH3 c){return AFromSrgbH3(c);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AFromTwoH1(AH1 c){return c*c;} + AH2 AFromTwoH2(AH2 c){return c*c;} + AH3 AFromTwoH3(AH3 c){return c*c;} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AFromThreeH1(AH1 c){return c*c*c;} + AH2 AFromThreeH2(AH2 c){return c*c*c;} + AH3 AFromThreeH3(AH3 c){return c*c*c;} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// CS REMAP +//============================================================================================================================== + // Simple remap 64x1 to 8x8 with rotated 2x2 pixel quads in quad linear. + // 543210 + // ====== + // ..xxx.
+ // yy...y + AU2 ARmp8x8(AU1 a){return AU2(ABfe(a,1u,3u),ABfiM(ABfe(a,3u,3u),a,1u));} +//============================================================================================================================== + // More complex remap 64x1 to 8x8 which is necessary for 2D wave reductions. + // 543210 + // ====== + // .xx..x + // y..yy. + // Details, + // LANE TO 8x8 MAPPING + // =================== + // 00 01 08 09 10 11 18 19 + // 02 03 0a 0b 12 13 1a 1b + // 04 05 0c 0d 14 15 1c 1d + // 06 07 0e 0f 16 17 1e 1f + // 20 21 28 29 30 31 38 39 + // 22 23 2a 2b 32 33 3a 3b + // 24 25 2c 2d 34 35 3c 3d + // 26 27 2e 2f 36 37 3e 3f + AU2 ARmpRed8x8(AU1 a){return AU2(ABfiM(ABfe(a,2u,3u),a,1u),ABfiM(ABfe(a,3u,3u),ABfe(a,1u,2u),2u));} +//============================================================================================================================== + #ifdef A_HALF + AW2 ARmp8x8H(AU1 a){return AW2(ABfe(a,1u,3u),ABfiM(ABfe(a,3u,3u),a,1u));} + AW2 ARmpRed8x8H(AU1 a){return AW2(ABfiM(ABfe(a,2u,3u),a,1u),ABfiM(ABfe(a,3u,3u),ABfe(a,1u,2u),2u));} + #endif +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// REFERENCE +// 
+//------------------------------------------------------------------------------------------------------------------------------ +// IEEE FLOAT RULES +// ================ +// - saturate(NaN)=0, saturate(-INF)=0, saturate(+INF)=1 +// - {+/-}0 * {+/-}INF = NaN +// - -INF + (+INF) = NaN +// - {+/-}0 / {+/-}0 = NaN +// - {+/-}INF / {+/-}INF = NaN +// - a<(-0) := sqrt(a) = NaN (a=-0.0 won't NaN) +// - 0 == -0 +// - 4/0 = +INF +// - 4/-0 = -INF +// - 4+INF = +INF +// - 4-INF = -INF +// - 4*(+INF) = +INF +// - 4*(-INF) = -INF +// - -4*(+INF) = -INF +// - sqrt(+INF) = +INF +//------------------------------------------------------------------------------------------------------------------------------ +// FP16 ENCODING +// ============= +// fedcba9876543210 +// ---------------- +// ......mmmmmmmmmm 10-bit mantissa (encodes 11-bit 0.5 to 1.0 except for denormals) +// .eeeee.......... 5-bit exponent +// .00000.......... denormals +// .00001.......... -14 exponent +// .11110.......... 15 exponent +// .111110000000000 infinity +// .11111nnnnnnnnnn NaN with n!=0 +// s............... sign +//------------------------------------------------------------------------------------------------------------------------------ +// FP16/INT16 ALIASING DENORMAL +// ============================ +// 11-bit unsigned integers alias with half float denormal/normal values, +// 1 = 2^(-24) = 1/16777216 ....................... first denormal value +// 2 = 2^(-23) +// ... +// 1023 = 2^(-14)*(1-2^(-10)) = 2^(-14)*(1-1/1024) ... last denormal value +// 1024 = 2^(-14) = 1/16384 .......................... first normal value that still maps to integers +// 2047 .............................................. last normal value that still maps to integers +// Scaling limits, +// 2^15 = 32768 ...................................... 
largest power of 2 scaling +// Largest pow2 conversion mapping is at *32768, +// 1 : 2^(-9) = 1/512 +// 2 : 1/256 +// 4 : 1/128 +// 8 : 1/64 +// 16 : 1/32 +// 32 : 1/16 +// 64 : 1/8 +// 128 : 1/4 +// 256 : 1/2 +// 512 : 1 +// 1024 : 2 +// 2047 : a little less than 4 +//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// GPU/CPU PORTABILITY +// +// +//------------------------------------------------------------------------------------------------------------------------------ +// This is the GPU implementation. +// See the CPU implementation for docs. 
+//============================================================================================================================== +#ifdef A_GPU + #define A_TRUE true + #define A_FALSE false + #define A_STATIC +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY +//============================================================================================================================== + #define retAD2 AD2 + #define retAD3 AD3 + #define retAD4 AD4 + #define retAF2 AF2 + #define retAF3 AF3 + #define retAF4 AF4 + #define retAL2 AL2 + #define retAL3 AL3 + #define retAL4 AL4 + #define retAU2 AU2 + #define retAU3 AU3 + #define retAU4 AU4 +//------------------------------------------------------------------------------------------------------------------------------ + #define inAD2 in AD2 + #define inAD3 in AD3 + #define inAD4 in AD4 + #define inAF2 in AF2 + #define inAF3 in AF3 + #define inAF4 in AF4 + #define inAL2 in AL2 + #define inAL3 in AL3 + #define inAL4 in AL4 + #define inAU2 in AU2 + #define inAU3 in AU3 + #define inAU4 in AU4 +//------------------------------------------------------------------------------------------------------------------------------ + #define inoutAD2 inout AD2 + #define inoutAD3 inout AD3 + #define inoutAD4 inout AD4 + #define inoutAF2 inout AF2 + #define inoutAF3 inout AF3 + #define inoutAF4 inout AF4 + #define inoutAL2 inout AL2 + #define inoutAL3 inout AL3 + #define inoutAL4 inout AL4 + #define inoutAU2 inout AU2 + 
#define inoutAU3 inout AU3 + #define inoutAU4 inout AU4 +//------------------------------------------------------------------------------------------------------------------------------ + #define outAD2 out AD2 + #define outAD3 out AD3 + #define outAD4 out AD4 + #define outAF2 out AF2 + #define outAF3 out AF3 + #define outAF4 out AF4 + #define outAL2 out AL2 + #define outAL3 out AL3 + #define outAL4 out AL4 + #define outAU2 out AU2 + #define outAU3 out AU3 + #define outAU4 out AU4 +//------------------------------------------------------------------------------------------------------------------------------ + #define varAD2(x) AD2 x + #define varAD3(x) AD3 x + #define varAD4(x) AD4 x + #define varAF2(x) AF2 x + #define varAF3(x) AF3 x + #define varAF4(x) AF4 x + #define varAL2(x) AL2 x + #define varAL3(x) AL3 x + #define varAL4(x) AL4 x + #define varAU2(x) AU2 x + #define varAU3(x) AU3 x + #define varAU4(x) AU4 x +//------------------------------------------------------------------------------------------------------------------------------ + #define initAD2(x,y) AD2(x,y) + #define initAD3(x,y,z) AD3(x,y,z) + #define initAD4(x,y,z,w) AD4(x,y,z,w) + #define initAF2(x,y) AF2(x,y) + #define initAF3(x,y,z) AF3(x,y,z) + #define initAF4(x,y,z,w) AF4(x,y,z,w) + #define initAL2(x,y) AL2(x,y) + #define initAL3(x,y,z) AL3(x,y,z) + #define initAL4(x,y,z,w) AL4(x,y,z,w) + #define initAU2(x,y) AU2(x,y) + #define initAU3(x,y,z) AU3(x,y,z) + #define initAU4(x,y,z,w) AU4(x,y,z,w) +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ 
+//============================================================================================================================== +// SCALAR RETURN OPS +//============================================================================================================================== + #define AAbsD1(a) abs(AD1(a)) + #define AAbsF1(a) abs(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define ACosD1(a) cos(AD1(a)) + #define ACosF1(a) cos(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define ADotD2(a,b) dot(AD2(a),AD2(b)) + #define ADotD3(a,b) dot(AD3(a),AD3(b)) + #define ADotD4(a,b) dot(AD4(a),AD4(b)) + #define ADotF2(a,b) dot(AF2(a),AF2(b)) + #define ADotF3(a,b) dot(AF3(a),AF3(b)) + #define ADotF4(a,b) dot(AF4(a),AF4(b)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AExp2D1(a) exp2(AD1(a)) + #define AExp2F1(a) exp2(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AFloorD1(a) floor(AD1(a)) + #define AFloorF1(a) floor(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define ALog2D1(a) log2(AD1(a)) + #define ALog2F1(a) log2(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AMaxD1(a,b) max(a,b) + #define AMaxF1(a,b) max(a,b) + #define AMaxL1(a,b) max(a,b) + #define AMaxU1(a,b) max(a,b) +//------------------------------------------------------------------------------------------------------------------------------ + #define AMinD1(a,b) min(a,b) + #define AMinF1(a,b) min(a,b) + #define AMinL1(a,b) min(a,b) + 
#define AMinU1(a,b) min(a,b) +//------------------------------------------------------------------------------------------------------------------------------ + #define ASinD1(a) sin(AD1(a)) + #define ASinF1(a) sin(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define ASqrtD1(a) sqrt(AD1(a)) + #define ASqrtF1(a) sqrt(AF1(a)) +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// SCALAR RETURN OPS - DEPENDENT +//============================================================================================================================== + #define APowD1(a,b) pow(AD1(a),AF1(b)) + #define APowF1(a,b) pow(AF1(a),AF1(b)) +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// VECTOR OPS +//------------------------------------------------------------------------------------------------------------------------------ +// These are added as needed for production or prototyping, so not necessarily a complete set. 
+// They follow a convention of taking in a destination and also returning the destination value to increase utility. +//============================================================================================================================== + #ifdef A_DUBL + AD2 opAAbsD2(outAD2 d,inAD2 a){d=abs(a);return d;} + AD3 opAAbsD3(outAD3 d,inAD3 a){d=abs(a);return d;} + AD4 opAAbsD4(outAD4 d,inAD4 a){d=abs(a);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){d=a+b;return d;} + AD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){d=a+b;return d;} + AD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){d=a+b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAAddOneD2(outAD2 d,inAD2 a,AD1 b){d=a+AD2_(b);return d;} + AD3 opAAddOneD3(outAD3 d,inAD3 a,AD1 b){d=a+AD3_(b);return d;} + AD4 opAAddOneD4(outAD4 d,inAD4 a,AD1 b){d=a+AD4_(b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opACpyD2(outAD2 d,inAD2 a){d=a;return d;} + AD3 opACpyD3(outAD3 d,inAD3 a){d=a;return d;} + AD4 opACpyD4(outAD4 d,inAD4 a){d=a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){d=ALerpD2(a,b,c);return d;} + AD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){d=ALerpD3(a,b,c);return d;} + AD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){d=ALerpD4(a,b,c);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){d=ALerpD2(a,b,AD2_(c));return d;} + AD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){d=ALerpD3(a,b,AD3_(c));return d;} + AD4 
opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){d=ALerpD4(a,b,AD4_(c));return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){d=max(a,b);return d;} + AD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){d=max(a,b);return d;} + AD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){d=max(a,b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){d=min(a,b);return d;} + AD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){d=min(a,b);return d;} + AD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){d=min(a,b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){d=a*b;return d;} + AD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){d=a*b;return d;} + AD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){d=a*b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){d=a*AD2_(b);return d;} + AD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){d=a*AD3_(b);return d;} + AD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){d=a*AD4_(b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opANegD2(outAD2 d,inAD2 a){d=-a;return d;} + AD3 opANegD3(outAD3 d,inAD3 a){d=-a;return d;} + AD4 opANegD4(outAD4 d,inAD4 a){d=-a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opARcpD2(outAD2 d,inAD2 a){d=ARcpD2(a);return d;} + AD3 opARcpD3(outAD3 d,inAD3 a){d=ARcpD3(a);return d;} + AD4 opARcpD4(outAD4 d,inAD4 a){d=ARcpD4(a);return d;} + #endif 
+//============================================================================================================================== + AF2 opAAbsF2(outAF2 d,inAF2 a){d=abs(a);return d;} + AF3 opAAbsF3(outAF3 d,inAF3 a){d=abs(a);return d;} + AF4 opAAbsF4(outAF4 d,inAF4 a){d=abs(a);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){d=a+b;return d;} + AF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){d=a+b;return d;} + AF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){d=a+b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAAddOneF2(outAF2 d,inAF2 a,AF1 b){d=a+AF2_(b);return d;} + AF3 opAAddOneF3(outAF3 d,inAF3 a,AF1 b){d=a+AF3_(b);return d;} + AF4 opAAddOneF4(outAF4 d,inAF4 a,AF1 b){d=a+AF4_(b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opACpyF2(outAF2 d,inAF2 a){d=a;return d;} + AF3 opACpyF3(outAF3 d,inAF3 a){d=a;return d;} + AF4 opACpyF4(outAF4 d,inAF4 a){d=a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){d=ALerpF2(a,b,c);return d;} + AF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){d=ALerpF3(a,b,c);return d;} + AF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){d=ALerpF4(a,b,c);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){d=ALerpF2(a,b,AF2_(c));return d;} + AF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){d=ALerpF3(a,b,AF3_(c));return d;} + AF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){d=ALerpF4(a,b,AF4_(c));return d;} 
+//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){d=max(a,b);return d;} + AF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){d=max(a,b);return d;} + AF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){d=max(a,b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){d=min(a,b);return d;} + AF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){d=min(a,b);return d;} + AF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){d=min(a,b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){d=a*b;return d;} + AF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){d=a*b;return d;} + AF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){d=a*b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){d=a*AF2_(b);return d;} + AF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){d=a*AF3_(b);return d;} + AF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){d=a*AF4_(b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opANegF2(outAF2 d,inAF2 a){d=-a;return d;} + AF3 opANegF3(outAF3 d,inAF3 a){d=-a;return d;} + AF4 opANegF4(outAF4 d,inAF4 a){d=-a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opARcpF2(outAF2 d,inAF2 a){d=ARcpF2(a);return d;} + AF3 opARcpF3(outAF3 d,inAF3 a){d=ARcpF3(a);return d;} + AF4 opARcpF4(outAF4 d,inAF4 a){d=ARcpF4(a);return d;} +#endif Property changes on: ps/trunk/binaries/data/mods/mod/shaders/glsl/ffx_a.h ___________________________________________________________________ Added: 
svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: ps/trunk/binaries/data/mods/mod/shaders/glsl/ffx_fsr1.h =================================================================== --- ps/trunk/binaries/data/mods/mod/shaders/glsl/ffx_fsr1.h (nonexistent) +++ ps/trunk/binaries/data/mods/mod/shaders/glsl/ffx_fsr1.h (revision 28010) @@ -0,0 +1,1199 @@ +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// AMD FidelityFX SUPER RESOLUTION [FSR 1] ::: SPATIAL SCALING & EXTRAS - v1.20210629 +// +// +//------------------------------------------------------------------------------------------------------------------------------ +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//------------------------------------------------------------------------------------------------------------------------------ +// FidelityFX Super Resolution Sample +// +// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +//------------------------------------------------------------------------------------------------------------------------------ +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//------------------------------------------------------------------------------------------------------------------------------ +// ABOUT +// ===== +// FSR is a collection of algorithms relating to generating a higher resolution image. +// This specific header focuses on single-image non-temporal image scaling, and related tools. +// +// The core functions are EASU and RCAS: +// [EASU] Edge Adaptive Spatial Upsampling ....... 1x to 4x area range spatial scaling, clamped adaptive elliptical filter. +// [RCAS] Robust Contrast Adaptive Sharpening .... A non-scaling variation on CAS. +// RCAS needs to be applied after EASU as a separate pass. +// +// Optional utility functions are: +// [LFGA] Linear Film Grain Applicator ........... Tool to apply film grain after scaling. +// [SRTM] Simple Reversible Tone-Mapper .......... Linear HDR {0 to FP16_MAX} to {0 to 1} and back. +// [TEPD] Temporal Energy Preserving Dither ...... Temporally energy preserving dithered {0 to 1} linear to gamma 2.0 conversion. +// See each individual sub-section for inline documentation. 
+//------------------------------------------------------------------------------------------------------------------------------ +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//------------------------------------------------------------------------------------------------------------------------------ +// FUNCTION PERMUTATIONS +// ===================== +// *F() ..... Single item computation with 32-bit. +// *H() ..... Single item computation with 16-bit, with packing (aka two 16-bit ops in parallel) when possible. +// *Hx2() ... Processing two items in parallel with 16-bit, easier packing. +// Not all interfaces in this file have a *Hx2() form. +//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// FSR - [EASU] EDGE ADAPTIVE SPATIAL UPSAMPLING +// +//------------------------------------------------------------------------------------------------------------------------------ +// EASU provides a high 
quality spatial-only scaling at relatively low cost. +// Meaning EASU is appropiate for laptops and other low-end GPUs. +// Quality from 1x to 4x area scaling is good. +//------------------------------------------------------------------------------------------------------------------------------ +// The scalar uses a modified fast approximation to the standard lanczos(size=2) kernel. +// EASU runs in a single pass, so it applies a directionally and anisotropically adaptive radial lanczos. +// This is also kept as simple as possible to have minimum runtime. +//------------------------------------------------------------------------------------------------------------------------------ +// The lanzcos filter has negative lobes, so by itself it will introduce ringing. +// To remove all ringing, the algorithm uses the nearest 2x2 input texels as a neighborhood, +// and limits output to the minimum and maximum of that neighborhood. +//------------------------------------------------------------------------------------------------------------------------------ +// Input image requirements: +// +// Color needs to be encoded as 3 channel[red, green, blue](e.g.XYZ not supported) +// Each channel needs to be in the range[0, 1] +// Any color primaries are supported +// Display / tonemapping curve needs to be as if presenting to sRGB display or similar(e.g.Gamma 2.0) +// There should be no banding in the input +// There should be no high amplitude noise in the input +// There should be no noise in the input that is not at input pixel granularity +// For performance purposes, use 32bpp formats +//------------------------------------------------------------------------------------------------------------------------------ +// Best to apply EASU at the end of the frame after tonemapping +// but before film grain or composite of the UI. 
+//------------------------------------------------------------------------------------------------------------------------------ +// Example of including this header for D3D HLSL : +// +// #define A_GPU 1 +// #define A_HLSL 1 +// #define A_HALF 1 +// #include "ffx_a.h" +// #define FSR_EASU_H 1 +// #define FSR_RCAS_H 1 +// //declare input callbacks +// #include "ffx_fsr1.h" +// +// Example of including this header for Vulkan GLSL : +// +// #define A_GPU 1 +// #define A_GLSL 1 +// #define A_HALF 1 +// #include "ffx_a.h" +// #define FSR_EASU_H 1 +// #define FSR_RCAS_H 1 +// //declare input callbacks +// #include "ffx_fsr1.h" +// +// Example of including this header for Vulkan HLSL : +// +// #define A_GPU 1 +// #define A_HLSL 1 +// #define A_HLSL_6_2 1 +// #define A_NO_16_BIT_CAST 1 +// #define A_HALF 1 +// #include "ffx_a.h" +// #define FSR_EASU_H 1 +// #define FSR_RCAS_H 1 +// //declare input callbacks +// #include "ffx_fsr1.h" +// +// Example of declaring the required input callbacks for GLSL : +// The callbacks need to gather4 for each color channel using the specified texture coordinate 'p'. +// EASU uses gather4 to reduce position computation logic and for free Arrays of Structures to Structures of Arrays conversion. +// +// AH4 FsrEasuRH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,0));} +// AH4 FsrEasuGH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,1));} +// AH4 FsrEasuBH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,2));} +// ... +// The FsrEasuCon function needs to be called from the CPU or GPU to set up constants. +// The difference in viewport and input image size is there to support Dynamic Resolution Scaling. +// To use FsrEasuCon() on the CPU, define A_CPU before including ffx_a and ffx_fsr1. +// Including a GPU example here, the 'con0' through 'con3' values would be stored out to a constant buffer. 
+// AU4 con0,con1,con2,con3; +// FsrEasuCon(con0,con1,con2,con3, +// 1920.0,1080.0, // Viewport size (top left aligned) in the input image which is to be scaled. +// 3840.0,2160.0, // The size of the input image. +// 2560.0,1440.0); // The output resolution. +//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// CONSTANT SETUP +//============================================================================================================================== +// Call to setup required constant values (works on CPU or GPU). +A_STATIC void FsrEasuCon( +outAU4 con0, +outAU4 con1, +outAU4 con2, +outAU4 con3, +// This the rendered image resolution being upscaled +AF1 inputViewportInPixelsX, +AF1 inputViewportInPixelsY, +// This is the resolution of the resource containing the input image (useful for dynamic resolution) +AF1 inputSizeInPixelsX, +AF1 inputSizeInPixelsY, +// This is the display resolution which the input image gets upscaled to +AF1 outputSizeInPixelsX, +AF1 outputSizeInPixelsY){ + // Output integer position to a pixel position in viewport. 
+ con0[0]=AU1_AF1(inputViewportInPixelsX*ARcpF1(outputSizeInPixelsX)); + con0[1]=AU1_AF1(inputViewportInPixelsY*ARcpF1(outputSizeInPixelsY)); + con0[2]=AU1_AF1(AF1_(0.5)*inputViewportInPixelsX*ARcpF1(outputSizeInPixelsX)-AF1_(0.5)); + con0[3]=AU1_AF1(AF1_(0.5)*inputViewportInPixelsY*ARcpF1(outputSizeInPixelsY)-AF1_(0.5)); + // Viewport pixel position to normalized image space. + // This is used to get upper-left of 'F' tap. + con1[0]=AU1_AF1(ARcpF1(inputSizeInPixelsX)); + con1[1]=AU1_AF1(ARcpF1(inputSizeInPixelsY)); + // Centers of gather4, first offset from upper-left of 'F'. + // +---+---+ + // | | | + // +--(0)--+ + // | b | c | + // +---F---+---+---+ + // | e | f | g | h | + // +--(1)--+--(2)--+ + // | i | j | k | l | + // +---+---+---+---+ + // | n | o | + // +--(3)--+ + // | | | + // +---+---+ + con1[2]=AU1_AF1(AF1_( 1.0)*ARcpF1(inputSizeInPixelsX)); + con1[3]=AU1_AF1(AF1_(-1.0)*ARcpF1(inputSizeInPixelsY)); + // These are from (0) instead of 'F'. + con2[0]=AU1_AF1(AF1_(-1.0)*ARcpF1(inputSizeInPixelsX)); + con2[1]=AU1_AF1(AF1_( 2.0)*ARcpF1(inputSizeInPixelsY)); + con2[2]=AU1_AF1(AF1_( 1.0)*ARcpF1(inputSizeInPixelsX)); + con2[3]=AU1_AF1(AF1_( 2.0)*ARcpF1(inputSizeInPixelsY)); + con3[0]=AU1_AF1(AF1_( 0.0)*ARcpF1(inputSizeInPixelsX)); + con3[1]=AU1_AF1(AF1_( 4.0)*ARcpF1(inputSizeInPixelsY)); + con3[2]=con3[3]=0;} + +//If the an offset into the input image resource +A_STATIC void FsrEasuConOffset( + outAU4 con0, + outAU4 con1, + outAU4 con2, + outAU4 con3, + // This the rendered image resolution being upscaled + AF1 inputViewportInPixelsX, + AF1 inputViewportInPixelsY, + // This is the resolution of the resource containing the input image (useful for dynamic resolution) + AF1 inputSizeInPixelsX, + AF1 inputSizeInPixelsY, + // This is the display resolution which the input image gets upscaled to + AF1 outputSizeInPixelsX, + AF1 outputSizeInPixelsY, + // This is the input image offset into the resource containing it (useful for dynamic resolution) + AF1 
inputOffsetInPixelsX, + AF1 inputOffsetInPixelsY) { + FsrEasuCon(con0, con1, con2, con3, inputViewportInPixelsX, inputViewportInPixelsY, inputSizeInPixelsX, inputSizeInPixelsY, outputSizeInPixelsX, outputSizeInPixelsY); + con0[2] = AU1_AF1(AF1_(0.5) * inputViewportInPixelsX * ARcpF1(outputSizeInPixelsX) - AF1_(0.5) + inputOffsetInPixelsX); + con0[3] = AU1_AF1(AF1_(0.5) * inputViewportInPixelsY * ARcpF1(outputSizeInPixelsY) - AF1_(0.5) + inputOffsetInPixelsY); +} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// NON-PACKED 32-BIT VERSION +//============================================================================================================================== +#if defined(A_GPU)&&defined(FSR_EASU_F) + // Input callback prototypes, need to be implemented by calling shader + AF4 FsrEasuRF(AF2 p); + AF4 FsrEasuGF(AF2 p); + AF4 FsrEasuBF(AF2 p); +//------------------------------------------------------------------------------------------------------------------------------ + // Filtering for a given tap for the scalar. + void FsrEasuTapF( + inout AF3 aC, // Accumulated color, with negative lobe. + inout AF1 aW, // Accumulated weight. + AF2 off, // Pixel offset from resolve position to tap. + AF2 dir, // Gradient direction. + AF2 len, // Length. + AF1 lob, // Negative lobe strength. + AF1 clp, // Clipping point. + AF3 c){ // Tap color. + // Rotate offset by direction. + AF2 v; + v.x=(off.x*( dir.x))+(off.y*dir.y); + v.y=(off.x*(-dir.y))+(off.y*dir.x); + // Anisotropy. 
+ v*=len; + // Compute distance^2. + AF1 d2=v.x*v.x+v.y*v.y; + // Limit to the window as at corner, 2 taps can easily be outside. + d2=min(d2,clp); + // Approximation of lancos2 without sin() or rcp(), or sqrt() to get x. + // (25/16 * (2/5 * x^2 - 1)^2 - (25/16 - 1)) * (1/4 * x^2 - 1)^2 + // |_______________________________________| |_______________| + // base window + // The general form of the 'base' is, + // (a*(b*x^2-1)^2-(a-1)) + // Where 'a=1/(2*b-b^2)' and 'b' moves around the negative lobe. + AF1 wB=AF1_(2.0/5.0)*d2+AF1_(-1.0); + AF1 wA=lob*d2+AF1_(-1.0); + wB*=wB; + wA*=wA; + wB=AF1_(25.0/16.0)*wB+AF1_(-(25.0/16.0-1.0)); + AF1 w=wB*wA; + // Do weighted average. + aC+=c*w;aW+=w;} +//------------------------------------------------------------------------------------------------------------------------------ + // Accumulate direction and length. + void FsrEasuSetF( + inout AF2 dir, + inout AF1 len, + AF2 pp, + AP1 biS,AP1 biT,AP1 biU,AP1 biV, + AF1 lA,AF1 lB,AF1 lC,AF1 lD,AF1 lE){ + // Compute bilinear weight, branches factor out as predicates are compiler time immediates. + // s t + // u v + AF1 w = AF1_(0.0); + if(biS)w=(AF1_(1.0)-pp.x)*(AF1_(1.0)-pp.y); + if(biT)w= pp.x *(AF1_(1.0)-pp.y); + if(biU)w=(AF1_(1.0)-pp.x)* pp.y ; + if(biV)w= pp.x * pp.y ; + // Direction is the '+' diff. + // a + // b c d + // e + // Then takes magnitude from abs average of both sides of 'c'. + // Length converts gradient reversal to 0, smoothly to non-reversal at 1, shaped, then adding horz and vert terms. + AF1 dc=lD-lC; + AF1 cb=lC-lB; + AF1 lenX=max(abs(dc),abs(cb)); + lenX=APrxLoRcpF1(lenX); + AF1 dirX=lD-lB; + dir.x+=dirX*w; + lenX=ASatF1(abs(dirX)*lenX); + lenX*=lenX; + len+=lenX*w; + // Repeat for the y axis. 
+ AF1 ec=lE-lC; + AF1 ca=lC-lA; + AF1 lenY=max(abs(ec),abs(ca)); + lenY=APrxLoRcpF1(lenY); + AF1 dirY=lE-lA; + dir.y+=dirY*w; + lenY=ASatF1(abs(dirY)*lenY); + lenY*=lenY; + len+=lenY*w;} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrEasuF( + out AF3 pix, + AU2 ip, // Integer pixel position in output. + AU4 con0, // Constants generated by FsrEasuCon(). + AU4 con1, + AU4 con2, + AU4 con3){ +//------------------------------------------------------------------------------------------------------------------------------ + // Get position of 'f'. + AF2 pp=AF2(ip)*AF2_AU2(con0.xy)+AF2_AU2(con0.zw); + AF2 fp=floor(pp); + pp-=fp; +//------------------------------------------------------------------------------------------------------------------------------ + // 12-tap kernel. + // b c + // e f g h + // i j k l + // n o + // Gather 4 ordering. + // a b + // r g + // For packed FP16, need either {rg} or {ab} so using the following setup for gather in all versions, + // a b <- unused (z) + // r g + // a b a b + // r g r g + // a b + // r g <- unused (z) + // Allowing dead-code removal to remove the 'z's. + AF2 p0=fp*AF2_AU2(con1.xy)+AF2_AU2(con1.zw); + // These are from p0 to avoid pulling two constants on pre-Navi hardware. + AF2 p1=p0+AF2_AU2(con2.xy); + AF2 p2=p0+AF2_AU2(con2.zw); + AF2 p3=p0+AF2_AU2(con3.xy); + AF4 bczzR=FsrEasuRF(p0); + AF4 bczzG=FsrEasuGF(p0); + AF4 bczzB=FsrEasuBF(p0); + AF4 ijfeR=FsrEasuRF(p1); + AF4 ijfeG=FsrEasuGF(p1); + AF4 ijfeB=FsrEasuBF(p1); + AF4 klhgR=FsrEasuRF(p2); + AF4 klhgG=FsrEasuGF(p2); + AF4 klhgB=FsrEasuBF(p2); + AF4 zzonR=FsrEasuRF(p3); + AF4 zzonG=FsrEasuGF(p3); + AF4 zzonB=FsrEasuBF(p3); +//------------------------------------------------------------------------------------------------------------------------------ + // Simplest multi-channel approximate luma possible (luma times 2, in 2 FMA/MAD). 
+ AF4 bczzL=bczzB*AF4_(0.5)+(bczzR*AF4_(0.5)+bczzG); + AF4 ijfeL=ijfeB*AF4_(0.5)+(ijfeR*AF4_(0.5)+ijfeG); + AF4 klhgL=klhgB*AF4_(0.5)+(klhgR*AF4_(0.5)+klhgG); + AF4 zzonL=zzonB*AF4_(0.5)+(zzonR*AF4_(0.5)+zzonG); + // Rename. + AF1 bL=bczzL.x; + AF1 cL=bczzL.y; + AF1 iL=ijfeL.x; + AF1 jL=ijfeL.y; + AF1 fL=ijfeL.z; + AF1 eL=ijfeL.w; + AF1 kL=klhgL.x; + AF1 lL=klhgL.y; + AF1 hL=klhgL.z; + AF1 gL=klhgL.w; + AF1 oL=zzonL.z; + AF1 nL=zzonL.w; + // Accumulate for bilinear interpolation. + AF2 dir=AF2_(0.0); + AF1 len=AF1_(0.0); + FsrEasuSetF(dir,len,pp,true, false,false,false,bL,eL,fL,gL,jL); + FsrEasuSetF(dir,len,pp,false,true ,false,false,cL,fL,gL,hL,kL); + FsrEasuSetF(dir,len,pp,false,false,true ,false,fL,iL,jL,kL,nL); + FsrEasuSetF(dir,len,pp,false,false,false,true ,gL,jL,kL,lL,oL); +//------------------------------------------------------------------------------------------------------------------------------ + // Normalize with approximation, and cleanup close to zero. + AF2 dir2=dir*dir; + AF1 dirR=dir2.x+dir2.y; + AP1 zro=dirR w = -m/(n+e+w+s) +// 1 == (w*(n+e+w+s)+m)/(4*w+1) -> w = (1-m)/(n+e+w+s-4*1) +// Then chooses the 'w' which results in no clipping, limits 'w', and multiplies by the 'sharp' amount. +// This solution above has issues with MSAA input as the steps along the gradient cause edge detection issues. +// So RCAS uses 4x the maximum and 4x the minimum (depending on equation)in place of the individual taps. +// As well as switching from 'm' to either the minimum or maximum (depending on side), to help in energy conservation. +// This stabilizes RCAS. +// RCAS does a simple highpass which is normalized against the local contrast then shaped, +// 0.25 +// 0.25 -1 0.25 +// 0.25 +// This is used as a noise detection filter, to reduce the effect of RCAS on grain, and focus on real edges. 
+// +// GLSL example for the required callbacks : +// +// AH4 FsrRcasLoadH(ASW2 p){return AH4(imageLoad(imgSrc,ASU2(p)));} +// void FsrRcasInputH(inout AH1 r,inout AH1 g,inout AH1 b) +// { +// //do any simple input color conversions here or leave empty if none needed +// } +// +// FsrRcasCon need to be called from the CPU or GPU to set up constants. +// Including a GPU example here, the 'con' value would be stored out to a constant buffer. +// +// AU4 con; +// FsrRcasCon(con, +// 0.0); // The scale is {0.0 := maximum sharpness, to N>0, where N is the number of stops (halving) of the reduction of sharpness}. +// --------------- +// RCAS sharpening supports a CAS-like pass-through alpha via, +// #define FSR_RCAS_PASSTHROUGH_ALPHA 1 +// RCAS also supports a define to enable a more expensive path to avoid some sharpening of noise. +// Would suggest it is better to apply film grain after RCAS sharpening (and after scaling) instead of using this define, +// #define FSR_RCAS_DENOISE 1 +//============================================================================================================================== +// This is set at the limit of providing unnatural results for sharpening. +#define FSR_RCAS_LIMIT (0.25-(1.0/16.0)) +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// CONSTANT SETUP +//============================================================================================================================== +// Call to setup required constant values (works on CPU or GPU). 
+A_STATIC void FsrRcasCon( +outAU4 con, +// The scale is {0.0 := maximum, to N>0, where N is the number of stops (halving) of the reduction of sharpness}. +AF1 sharpness){ + // Transform from stops to linear value. + sharpness=AExp2F1(-sharpness); + varAF2(hSharp)=initAF2(sharpness,sharpness); + con[0]=AU1_AF1(sharpness); + con[1]=AU1_AH2_AF2(hSharp); + con[2]=0; + con[3]=0;} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// NON-PACKED 32-BIT VERSION +//============================================================================================================================== +#if defined(A_GPU)&&defined(FSR_RCAS_F) + // Input callback prototypes that need to be implemented by calling shader + AF4 FsrRcasLoadF(ASU2 p); + void FsrRcasInputF(inout AF1 r,inout AF1 g,inout AF1 b); +//------------------------------------------------------------------------------------------------------------------------------ + void FsrRcasF( + out AF1 pixR, // Output values, non-vector so port between RcasFilter() and RcasFilterH() is easy. + out AF1 pixG, + out AF1 pixB, + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + out AF1 pixA, + #endif + AU2 ip, // Integer pixel position in output. + AU4 con){ // Constant generated by RcasSetup(). + // Algorithm uses minimal 3x3 pixel neighborhood. 
+ // b + // d e f + // h + ASU2 sp=ASU2(ip); + AF3 b=FsrRcasLoadF(sp+ASU2( 0,-1)).rgb; + AF3 d=FsrRcasLoadF(sp+ASU2(-1, 0)).rgb; + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + AF4 ee=FsrRcasLoadF(sp); + AF3 e=ee.rgb;pixA=ee.a; + #else + AF3 e=FsrRcasLoadF(sp).rgb; + #endif + AF3 f=FsrRcasLoadF(sp+ASU2( 1, 0)).rgb; + AF3 h=FsrRcasLoadF(sp+ASU2( 0, 1)).rgb; + // Rename (32-bit) or regroup (16-bit). + AF1 bR=b.r; + AF1 bG=b.g; + AF1 bB=b.b; + AF1 dR=d.r; + AF1 dG=d.g; + AF1 dB=d.b; + AF1 eR=e.r; + AF1 eG=e.g; + AF1 eB=e.b; + AF1 fR=f.r; + AF1 fG=f.g; + AF1 fB=f.b; + AF1 hR=h.r; + AF1 hG=h.g; + AF1 hB=h.b; + // Run optional input transform. + FsrRcasInputF(bR,bG,bB); + FsrRcasInputF(dR,dG,dB); + FsrRcasInputF(eR,eG,eB); + FsrRcasInputF(fR,fG,fB); + FsrRcasInputF(hR,hG,hB); + // Luma times 2. + AF1 bL=bB*AF1_(0.5)+(bR*AF1_(0.5)+bG); + AF1 dL=dB*AF1_(0.5)+(dR*AF1_(0.5)+dG); + AF1 eL=eB*AF1_(0.5)+(eR*AF1_(0.5)+eG); + AF1 fL=fB*AF1_(0.5)+(fR*AF1_(0.5)+fG); + AF1 hL=hB*AF1_(0.5)+(hR*AF1_(0.5)+hG); + // Noise detection. + AF1 nz=AF1_(0.25)*bL+AF1_(0.25)*dL+AF1_(0.25)*fL+AF1_(0.25)*hL-eL; + nz=ASatF1(abs(nz)*APrxMedRcpF1(AMax3F1(AMax3F1(bL,dL,eL),fL,hL)-AMin3F1(AMin3F1(bL,dL,eL),fL,hL))); + nz=AF1_(-0.5)*nz+AF1_(1.0); + // Min and max of ring. + AF1 mn4R=min(AMin3F1(bR,dR,fR),hR); + AF1 mn4G=min(AMin3F1(bG,dG,fG),hG); + AF1 mn4B=min(AMin3F1(bB,dB,fB),hB); + AF1 mx4R=max(AMax3F1(bR,dR,fR),hR); + AF1 mx4G=max(AMax3F1(bG,dG,fG),hG); + AF1 mx4B=max(AMax3F1(bB,dB,fB),hB); + // Immediate constants for peak range. + AF2 peakC=AF2(1.0,-1.0*4.0); + // Limiters, these need to be high precision RCPs. 
+ AF1 hitMinR=min(mn4R,eR)*ARcpF1(AF1_(4.0)*mx4R); + AF1 hitMinG=min(mn4G,eG)*ARcpF1(AF1_(4.0)*mx4G); + AF1 hitMinB=min(mn4B,eB)*ARcpF1(AF1_(4.0)*mx4B); + AF1 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpF1(AF1_(4.0)*mn4R+peakC.y); + AF1 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpF1(AF1_(4.0)*mn4G+peakC.y); + AF1 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpF1(AF1_(4.0)*mn4B+peakC.y); + AF1 lobeR=max(-hitMinR,hitMaxR); + AF1 lobeG=max(-hitMinG,hitMaxG); + AF1 lobeB=max(-hitMinB,hitMaxB); + AF1 lobe=max(AF1_(-FSR_RCAS_LIMIT),min(AMax3F1(lobeR,lobeG,lobeB),AF1_(0.0)))*AF1_AU1(con.x); + // Apply noise removal. + #ifdef FSR_RCAS_DENOISE + lobe*=nz; + #endif + // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes. + AF1 rcpL=APrxMedRcpF1(AF1_(4.0)*lobe+AF1_(1.0)); + pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL; + pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL; + pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL; + return;} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// NON-PACKED 16-BIT VERSION +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_RCAS_H) + // Input callback prototypes that need to be implemented by calling shader + AH4 FsrRcasLoadH(ASW2 p); + void FsrRcasInputH(inout AH1 r,inout AH1 g,inout AH1 b); +//------------------------------------------------------------------------------------------------------------------------------ + void 
FsrRcasH( + out AH1 pixR, // Output values, non-vector so port between RcasFilter() and RcasFilterH() is easy. + out AH1 pixG, + out AH1 pixB, + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + out AH1 pixA, + #endif + AU2 ip, // Integer pixel position in output. + AU4 con){ // Constant generated by RcasSetup(). + // Sharpening algorithm uses minimal 3x3 pixel neighborhood. + // b + // d e f + // h + ASW2 sp=ASW2(ip); + AH3 b=FsrRcasLoadH(sp+ASW2( 0,-1)).rgb; + AH3 d=FsrRcasLoadH(sp+ASW2(-1, 0)).rgb; + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + AH4 ee=FsrRcasLoadH(sp); + AH3 e=ee.rgb;pixA=ee.a; + #else + AH3 e=FsrRcasLoadH(sp).rgb; + #endif + AH3 f=FsrRcasLoadH(sp+ASW2( 1, 0)).rgb; + AH3 h=FsrRcasLoadH(sp+ASW2( 0, 1)).rgb; + // Rename (32-bit) or regroup (16-bit). + AH1 bR=b.r; + AH1 bG=b.g; + AH1 bB=b.b; + AH1 dR=d.r; + AH1 dG=d.g; + AH1 dB=d.b; + AH1 eR=e.r; + AH1 eG=e.g; + AH1 eB=e.b; + AH1 fR=f.r; + AH1 fG=f.g; + AH1 fB=f.b; + AH1 hR=h.r; + AH1 hG=h.g; + AH1 hB=h.b; + // Run optional input transform. + FsrRcasInputH(bR,bG,bB); + FsrRcasInputH(dR,dG,dB); + FsrRcasInputH(eR,eG,eB); + FsrRcasInputH(fR,fG,fB); + FsrRcasInputH(hR,hG,hB); + // Luma times 2. + AH1 bL=bB*AH1_(0.5)+(bR*AH1_(0.5)+bG); + AH1 dL=dB*AH1_(0.5)+(dR*AH1_(0.5)+dG); + AH1 eL=eB*AH1_(0.5)+(eR*AH1_(0.5)+eG); + AH1 fL=fB*AH1_(0.5)+(fR*AH1_(0.5)+fG); + AH1 hL=hB*AH1_(0.5)+(hR*AH1_(0.5)+hG); + // Noise detection. + AH1 nz=AH1_(0.25)*bL+AH1_(0.25)*dL+AH1_(0.25)*fL+AH1_(0.25)*hL-eL; + nz=ASatH1(abs(nz)*APrxMedRcpH1(AMax3H1(AMax3H1(bL,dL,eL),fL,hL)-AMin3H1(AMin3H1(bL,dL,eL),fL,hL))); + nz=AH1_(-0.5)*nz+AH1_(1.0); + // Min and max of ring. + AH1 mn4R=min(AMin3H1(bR,dR,fR),hR); + AH1 mn4G=min(AMin3H1(bG,dG,fG),hG); + AH1 mn4B=min(AMin3H1(bB,dB,fB),hB); + AH1 mx4R=max(AMax3H1(bR,dR,fR),hR); + AH1 mx4G=max(AMax3H1(bG,dG,fG),hG); + AH1 mx4B=max(AMax3H1(bB,dB,fB),hB); + // Immediate constants for peak range. + AH2 peakC=AH2(1.0,-1.0*4.0); + // Limiters, these need to be high precision RCPs. 
+ AH1 hitMinR=min(mn4R,eR)*ARcpH1(AH1_(4.0)*mx4R); + AH1 hitMinG=min(mn4G,eG)*ARcpH1(AH1_(4.0)*mx4G); + AH1 hitMinB=min(mn4B,eB)*ARcpH1(AH1_(4.0)*mx4B); + AH1 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpH1(AH1_(4.0)*mn4R+peakC.y); + AH1 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpH1(AH1_(4.0)*mn4G+peakC.y); + AH1 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpH1(AH1_(4.0)*mn4B+peakC.y); + AH1 lobeR=max(-hitMinR,hitMaxR); + AH1 lobeG=max(-hitMinG,hitMaxG); + AH1 lobeB=max(-hitMinB,hitMaxB); + AH1 lobe=max(AH1_(-FSR_RCAS_LIMIT),min(AMax3H1(lobeR,lobeG,lobeB),AH1_(0.0)))*AH2_AU1(con.y).x; + // Apply noise removal. + #ifdef FSR_RCAS_DENOISE + lobe*=nz; + #endif + // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes. + AH1 rcpL=APrxMedRcpH1(AH1_(4.0)*lobe+AH1_(1.0)); + pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL; + pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL; + pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL;} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// PACKED 16-BIT VERSION +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_RCAS_HX2) + // Input callback prototypes that need to be implemented by the calling shader + AH4 FsrRcasLoadHx2(ASW2 p); + void FsrRcasInputHx2(inout AH2 r,inout AH2 g,inout AH2 b); +//------------------------------------------------------------------------------------------------------------------------------ + // Can be 
used to convert from packed Structures of Arrays to Arrays of Structures for store. + void FsrRcasDepackHx2(out AH4 pix0,out AH4 pix1,AH2 pixR,AH2 pixG,AH2 pixB){ + #ifdef A_HLSL + // Invoke a slower path for DX only, since it won't allow uninitialized values. + pix0.a=pix1.a=0.0; + #endif + pix0.rgb=AH3(pixR.x,pixG.x,pixB.x); + pix1.rgb=AH3(pixR.y,pixG.y,pixB.y);} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrRcasHx2( + // Output values are for 2 8x8 tiles in a 16x8 region. + // pix.x = left 8x8 tile + // pix.y = right 8x8 tile + // This enables later processing to easily be packed as well. + out AH2 pixR, + out AH2 pixG, + out AH2 pixB, + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + out AH2 pixA, + #endif + AU2 ip, // Integer pixel position in output. + AU4 con){ // Constant generated by RcasSetup(). + // No scaling algorithm uses minimal 3x3 pixel neighborhood. + ASW2 sp0=ASW2(ip); + AH3 b0=FsrRcasLoadHx2(sp0+ASW2( 0,-1)).rgb; + AH3 d0=FsrRcasLoadHx2(sp0+ASW2(-1, 0)).rgb; + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + AH4 ee0=FsrRcasLoadHx2(sp0); + AH3 e0=ee0.rgb;pixA.r=ee0.a; + #else + AH3 e0=FsrRcasLoadHx2(sp0).rgb; + #endif + AH3 f0=FsrRcasLoadHx2(sp0+ASW2( 1, 0)).rgb; + AH3 h0=FsrRcasLoadHx2(sp0+ASW2( 0, 1)).rgb; + ASW2 sp1=sp0+ASW2(8,0); + AH3 b1=FsrRcasLoadHx2(sp1+ASW2( 0,-1)).rgb; + AH3 d1=FsrRcasLoadHx2(sp1+ASW2(-1, 0)).rgb; + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + AH4 ee1=FsrRcasLoadHx2(sp1); + AH3 e1=ee1.rgb;pixA.g=ee1.a; + #else + AH3 e1=FsrRcasLoadHx2(sp1).rgb; + #endif + AH3 f1=FsrRcasLoadHx2(sp1+ASW2( 1, 0)).rgb; + AH3 h1=FsrRcasLoadHx2(sp1+ASW2( 0, 1)).rgb; + // Arrays of Structures to Structures of Arrays conversion. 
+ AH2 bR=AH2(b0.r,b1.r); + AH2 bG=AH2(b0.g,b1.g); + AH2 bB=AH2(b0.b,b1.b); + AH2 dR=AH2(d0.r,d1.r); + AH2 dG=AH2(d0.g,d1.g); + AH2 dB=AH2(d0.b,d1.b); + AH2 eR=AH2(e0.r,e1.r); + AH2 eG=AH2(e0.g,e1.g); + AH2 eB=AH2(e0.b,e1.b); + AH2 fR=AH2(f0.r,f1.r); + AH2 fG=AH2(f0.g,f1.g); + AH2 fB=AH2(f0.b,f1.b); + AH2 hR=AH2(h0.r,h1.r); + AH2 hG=AH2(h0.g,h1.g); + AH2 hB=AH2(h0.b,h1.b); + // Run optional input transform. + FsrRcasInputHx2(bR,bG,bB); + FsrRcasInputHx2(dR,dG,dB); + FsrRcasInputHx2(eR,eG,eB); + FsrRcasInputHx2(fR,fG,fB); + FsrRcasInputHx2(hR,hG,hB); + // Luma times 2. + AH2 bL=bB*AH2_(0.5)+(bR*AH2_(0.5)+bG); + AH2 dL=dB*AH2_(0.5)+(dR*AH2_(0.5)+dG); + AH2 eL=eB*AH2_(0.5)+(eR*AH2_(0.5)+eG); + AH2 fL=fB*AH2_(0.5)+(fR*AH2_(0.5)+fG); + AH2 hL=hB*AH2_(0.5)+(hR*AH2_(0.5)+hG); + // Noise detection. + AH2 nz=AH2_(0.25)*bL+AH2_(0.25)*dL+AH2_(0.25)*fL+AH2_(0.25)*hL-eL; + nz=ASatH2(abs(nz)*APrxMedRcpH2(AMax3H2(AMax3H2(bL,dL,eL),fL,hL)-AMin3H2(AMin3H2(bL,dL,eL),fL,hL))); + nz=AH2_(-0.5)*nz+AH2_(1.0); + // Min and max of ring. + AH2 mn4R=min(AMin3H2(bR,dR,fR),hR); + AH2 mn4G=min(AMin3H2(bG,dG,fG),hG); + AH2 mn4B=min(AMin3H2(bB,dB,fB),hB); + AH2 mx4R=max(AMax3H2(bR,dR,fR),hR); + AH2 mx4G=max(AMax3H2(bG,dG,fG),hG); + AH2 mx4B=max(AMax3H2(bB,dB,fB),hB); + // Immediate constants for peak range. + AH2 peakC=AH2(1.0,-1.0*4.0); + // Limiters, these need to be high precision RCPs. + AH2 hitMinR=min(mn4R,eR)*ARcpH2(AH2_(4.0)*mx4R); + AH2 hitMinG=min(mn4G,eG)*ARcpH2(AH2_(4.0)*mx4G); + AH2 hitMinB=min(mn4B,eB)*ARcpH2(AH2_(4.0)*mx4B); + AH2 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpH2(AH2_(4.0)*mn4R+peakC.y); + AH2 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpH2(AH2_(4.0)*mn4G+peakC.y); + AH2 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpH2(AH2_(4.0)*mn4B+peakC.y); + AH2 lobeR=max(-hitMinR,hitMaxR); + AH2 lobeG=max(-hitMinG,hitMaxG); + AH2 lobeB=max(-hitMinB,hitMaxB); + AH2 lobe=max(AH2_(-FSR_RCAS_LIMIT),min(AMax3H2(lobeR,lobeG,lobeB),AH2_(0.0)))*AH2_(AH2_AU1(con.y).x); + // Apply noise removal. 
+ #ifdef FSR_RCAS_DENOISE + lobe*=nz; + #endif + // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes. + AH2 rcpL=APrxMedRcpH2(AH2_(4.0)*lobe+AH2_(1.0)); + pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL; + pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL; + pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL;} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// FSR - [LFGA] LINEAR FILM GRAIN APPLICATOR +// +//------------------------------------------------------------------------------------------------------------------------------ +// Adding output-resolution film grain after scaling is a good way to mask both rendering and scaling artifacts. +// Suggest using tiled blue noise as film grain input, with peak noise frequency set for a specific look and feel. +// The 'Lfga*()' functions provide a convenient way to introduce grain. +// These functions limit grain based on distance to signal limits. +// This is done so that the grain is temporally energy preserving, and thus won't modify image tonality. +// Grain application should be done in a linear colorspace. +// The grain should be temporally changing, but have a temporal sum per pixel that adds to zero (non-biased). 
+//------------------------------------------------------------------------------------------------------------------------------ +// Usage, +// FsrLfga*( +// color, // In/out linear colorspace color {0 to 1} ranged. +// grain, // Per pixel grain texture value {-0.5 to 0.5} ranged, input is 3-channel to support colored grain. +// amount); // Amount of grain (0 to 1} ranged. +//------------------------------------------------------------------------------------------------------------------------------ +// Example if grain texture is monochrome: 'FsrLfgaF(color,AF3_(grain),amount)' +//============================================================================================================================== +#if defined(A_GPU) + // Maximum grain is the minimum distance to the signal limit. + void FsrLfgaF(inout AF3 c,AF3 t,AF1 a){c+=(t*AF3_(a))*min(AF3_(1.0)-c,c);} +#endif +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF) + // Half precision version (slower). + void FsrLfgaH(inout AH3 c,AH3 t,AH1 a){c+=(t*AH3_(a))*min(AH3_(1.0)-c,c);} +//------------------------------------------------------------------------------------------------------------------------------ + // Packed half precision version (faster). 
+ void FsrLfgaHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 tR,AH2 tG,AH2 tB,AH1 a){ + cR+=(tR*AH2_(a))*min(AH2_(1.0)-cR,cR);cG+=(tG*AH2_(a))*min(AH2_(1.0)-cG,cG);cB+=(tB*AH2_(a))*min(AH2_(1.0)-cB,cB);} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// FSR - [SRTM] SIMPLE REVERSIBLE TONE-MAPPER +// +//------------------------------------------------------------------------------------------------------------------------------ +// This provides a way to take linear HDR color {0 to FP16_MAX} and convert it into a temporary {0 to 1} ranged post-tonemapped linear. +// The tonemapper preserves RGB ratio, which helps maintain HDR color bleed during filtering. +//------------------------------------------------------------------------------------------------------------------------------ +// Reversible tonemapper usage, +// FsrSrtm*(color); // {0 to FP16_MAX} converted to {0 to 1}. +// FsrSrtmInv*(color); // {0 to 1} converted into {0 to 32768, output peak safe for FP16}. 
+//============================================================================================================================== +#if defined(A_GPU) + void FsrSrtmF(inout AF3 c){c*=AF3_(ARcpF1(AMax3F1(c.r,c.g,c.b)+AF1_(1.0)));} + // The extra max solves the c=1.0 case (which is a /0). + void FsrSrtmInvF(inout AF3 c){c*=AF3_(ARcpF1(max(AF1_(1.0/32768.0),AF1_(1.0)-AMax3F1(c.r,c.g,c.b))));} +#endif +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF) + void FsrSrtmH(inout AH3 c){c*=AH3_(ARcpH1(AMax3H1(c.r,c.g,c.b)+AH1_(1.0)));} + void FsrSrtmInvH(inout AH3 c){c*=AH3_(ARcpH1(max(AH1_(1.0/32768.0),AH1_(1.0)-AMax3H1(c.r,c.g,c.b))));} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrSrtmHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB){ + AH2 rcp=ARcpH2(AMax3H2(cR,cG,cB)+AH2_(1.0));cR*=rcp;cG*=rcp;cB*=rcp;} + void FsrSrtmInvHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB){ + AH2 rcp=ARcpH2(max(AH2_(1.0/32768.0),AH2_(1.0)-AMax3H2(cR,cG,cB)));cR*=rcp;cG*=rcp;cB*=rcp;} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// FSR - [TEPD] TEMPORAL ENERGY 
PRESERVING DITHER +// +//------------------------------------------------------------------------------------------------------------------------------ +// Temporally energy preserving dithered {0 to 1} linear to gamma 2.0 conversion. +// Gamma 2.0 is used so that the conversion back to linear is just to square the color. +// The conversion comes in 8-bit and 10-bit modes, designed for output to 8-bit UNORM or 10:10:10:2 respectively. +// Given good non-biased temporal blue noise as dither input, +// the output dither will temporally conserve energy. +// This is done by choosing the linear nearest step point instead of perceptual nearest. +// See code below for details. +//------------------------------------------------------------------------------------------------------------------------------ +// DX SPEC RULES FOR FLOAT->UNORM 8-BIT CONVERSION +// =============================================== +// - Output is 'uint(floor(saturate(n)*255.0+0.5))'. +// - Thus rounding is to nearest. +// - NaN gets converted to zero. +// - INF is clamped to {0.0 to 1.0}. +//============================================================================================================================== +#if defined(A_GPU) + // Hand tuned integer position to dither value, with more values than simple checkerboard. + // Only 32-bit has enough precision for this compddation. + // Output is {0 to <1}. + AF1 FsrTepdDitF(AU2 p,AU1 f){ + AF1 x=AF1_(p.x+f); + AF1 y=AF1_(p.y); + // The 1.61803 golden ratio. + AF1 a=AF1_((1.0+sqrt(5.0))/2.0); + // Number designed to provide a good visual pattern. + AF1 b=AF1_(1.0/3.69); + x=x*a+(y*b); + return AFractF1(x);} +//------------------------------------------------------------------------------------------------------------------------------ + // This version is 8-bit gamma 2.0. + // The 'c' input is {0 to 1}. + // Output is {0 to 1} ready for image store. 
+ void FsrTepdC8F(inout AF3 c,AF1 dit){ + AF3 n=sqrt(c); + n=floor(n*AF3_(255.0))*AF3_(1.0/255.0); + AF3 a=n*n; + AF3 b=n+AF3_(1.0/255.0);b=b*b; + // Ratio of 'a' to 'b' required to produce 'c'. + // APrxLoRcpF1() won't work here (at least for very high dynamic ranges). + // APrxMedRcpF1() is an IADD,FMA,MUL. + AF3 r=(c-b)*APrxMedRcpF3(a-b); + // Use the ratio as a cutoff to choose 'a' or 'b'. + // AGtZeroF1() is a MUL. + c=ASatF3(n+AGtZeroF3(AF3_(dit)-r)*AF3_(1.0/255.0));} +//------------------------------------------------------------------------------------------------------------------------------ + // This version is 10-bit gamma 2.0. + // The 'c' input is {0 to 1}. + // Output is {0 to 1} ready for image store. + void FsrTepdC10F(inout AF3 c,AF1 dit){ + AF3 n=sqrt(c); + n=floor(n*AF3_(1023.0))*AF3_(1.0/1023.0); + AF3 a=n*n; + AF3 b=n+AF3_(1.0/1023.0);b=b*b; + AF3 r=(c-b)*APrxMedRcpF3(a-b); + c=ASatF3(n+AGtZeroF3(AF3_(dit)-r)*AF3_(1.0/1023.0));} +#endif +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF) + AH1 FsrTepdDitH(AU2 p,AU1 f){ + AF1 x=AF1_(p.x+f); + AF1 y=AF1_(p.y); + AF1 a=AF1_((1.0+sqrt(5.0))/2.0); + AF1 b=AF1_(1.0/3.69); + x=x*a+(y*b); + return AH1(AFractF1(x));} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrTepdC8H(inout AH3 c,AH1 dit){ + AH3 n=sqrt(c); + n=floor(n*AH3_(255.0))*AH3_(1.0/255.0); + AH3 a=n*n; + AH3 b=n+AH3_(1.0/255.0);b=b*b; + AH3 r=(c-b)*APrxMedRcpH3(a-b); + c=ASatH3(n+AGtZeroH3(AH3_(dit)-r)*AH3_(1.0/255.0));} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrTepdC10H(inout AH3 c,AH1 dit){ + AH3 n=sqrt(c); + n=floor(n*AH3_(1023.0))*AH3_(1.0/1023.0); + AH3 a=n*n; + AH3 b=n+AH3_(1.0/1023.0);b=b*b; + AH3 r=(c-b)*APrxMedRcpH3(a-b); + 
c=ASatH3(n+AGtZeroH3(AH3_(dit)-r)*AH3_(1.0/1023.0));} +//============================================================================================================================== + // This computes dither for positions 'p' and 'p+{8,0}'. + AH2 FsrTepdDitHx2(AU2 p,AU1 f){ + AF2 x; + x.x=AF1_(p.x+f); + x.y=x.x+AF1_(8.0); + AF1 y=AF1_(p.y); + AF1 a=AF1_((1.0+sqrt(5.0))/2.0); + AF1 b=AF1_(1.0/3.69); + x=x*AF2_(a)+AF2_(y*b); + return AH2(AFractF2(x));} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrTepdC8Hx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 dit){ + AH2 nR=sqrt(cR); + AH2 nG=sqrt(cG); + AH2 nB=sqrt(cB); + nR=floor(nR*AH2_(255.0))*AH2_(1.0/255.0); + nG=floor(nG*AH2_(255.0))*AH2_(1.0/255.0); + nB=floor(nB*AH2_(255.0))*AH2_(1.0/255.0); + AH2 aR=nR*nR; + AH2 aG=nG*nG; + AH2 aB=nB*nB; + AH2 bR=nR+AH2_(1.0/255.0);bR=bR*bR; + AH2 bG=nG+AH2_(1.0/255.0);bG=bG*bG; + AH2 bB=nB+AH2_(1.0/255.0);bB=bB*bB; + AH2 rR=(cR-bR)*APrxMedRcpH2(aR-bR); + AH2 rG=(cG-bG)*APrxMedRcpH2(aG-bG); + AH2 rB=(cB-bB)*APrxMedRcpH2(aB-bB); + cR=ASatH2(nR+AGtZeroH2(dit-rR)*AH2_(1.0/255.0)); + cG=ASatH2(nG+AGtZeroH2(dit-rG)*AH2_(1.0/255.0)); + cB=ASatH2(nB+AGtZeroH2(dit-rB)*AH2_(1.0/255.0));} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrTepdC10Hx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 dit){ + AH2 nR=sqrt(cR); + AH2 nG=sqrt(cG); + AH2 nB=sqrt(cB); + nR=floor(nR*AH2_(1023.0))*AH2_(1.0/1023.0); + nG=floor(nG*AH2_(1023.0))*AH2_(1.0/1023.0); + nB=floor(nB*AH2_(1023.0))*AH2_(1.0/1023.0); + AH2 aR=nR*nR; + AH2 aG=nG*nG; + AH2 aB=nB*nB; + AH2 bR=nR+AH2_(1.0/1023.0);bR=bR*bR; + AH2 bG=nG+AH2_(1.0/1023.0);bG=bG*bG; + AH2 bB=nB+AH2_(1.0/1023.0);bB=bB*bB; + AH2 rR=(cR-bR)*APrxMedRcpH2(aR-bR); + AH2 rG=(cG-bG)*APrxMedRcpH2(aG-bG); + AH2 rB=(cB-bB)*APrxMedRcpH2(aB-bB); + 
cR=ASatH2(nR+AGtZeroH2(dit-rR)*AH2_(1.0/1023.0)); + cG=ASatH2(nG+AGtZeroH2(dit-rG)*AH2_(1.0/1023.0)); + cB=ASatH2(nB+AGtZeroH2(dit-rB)*AH2_(1.0/1023.0));} +#endif Property changes on: ps/trunk/binaries/data/mods/mod/shaders/glsl/ffx_fsr1.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: ps/trunk/binaries/data/mods/mod/shaders/glsl/upscale_bilinear.fs =================================================================== --- ps/trunk/binaries/data/mods/mod/shaders/glsl/upscale_bilinear.fs (nonexistent) +++ ps/trunk/binaries/data/mods/mod/shaders/glsl/upscale_bilinear.fs (revision 28010) @@ -0,0 +1,19 @@ +#version 120 + +#include "common/fragment.h" +#include "common/stage.h" + +BEGIN_DRAW_TEXTURES + TEXTURE_2D(0, inTex) +END_DRAW_TEXTURES + +BEGIN_DRAW_UNIFORMS + UNIFORM(vec4, screenSize) +END_DRAW_UNIFORMS + +VERTEX_OUTPUT(0, vec2, v_tex); + +void main() +{ + OUTPUT_FRAGMENT_SINGLE_COLOR(SAMPLE_2D(GET_DRAW_TEXTURE_2D(inTex), v_tex)); +} Property changes on: ps/trunk/binaries/data/mods/mod/shaders/glsl/upscale_bilinear.fs ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: ps/trunk/binaries/data/mods/mod/shaders/glsl/upscale_bilinear.xml =================================================================== --- ps/trunk/binaries/data/mods/mod/shaders/glsl/upscale_bilinear.xml (nonexistent) +++ ps/trunk/binaries/data/mods/mod/shaders/glsl/upscale_bilinear.xml (revision 28010) @@ -0,0 +1,11 @@ + + + + + + + + + + + Property changes on: ps/trunk/binaries/data/mods/mod/shaders/glsl/upscale_bilinear.xml ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: ps/trunk/binaries/data/mods/mod/shaders/glsl/upscale_nearest.fs =================================================================== 
--- ps/trunk/binaries/data/mods/mod/shaders/glsl/upscale_nearest.fs (nonexistent) +++ ps/trunk/binaries/data/mods/mod/shaders/glsl/upscale_nearest.fs (revision 28010) @@ -0,0 +1,19 @@ +#version 130 + +#include "common/fragment.h" +#include "common/stage.h" + +BEGIN_DRAW_TEXTURES + TEXTURE_2D(0, inTex) +END_DRAW_TEXTURES + +BEGIN_DRAW_UNIFORMS + UNIFORM(vec4, screenSize) +END_DRAW_UNIFORMS + +VERTEX_OUTPUT(0, vec2, v_tex); + +void main() +{ + OUTPUT_FRAGMENT_SINGLE_COLOR(texelFetch(GET_DRAW_TEXTURE_2D(inTex), ivec2(v_tex * screenSize.xy), 0)); +} Property changes on: ps/trunk/binaries/data/mods/mod/shaders/glsl/upscale_nearest.fs ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: ps/trunk/binaries/data/mods/mod/shaders/glsl/upscale_nearest.xml =================================================================== --- ps/trunk/binaries/data/mods/mod/shaders/glsl/upscale_nearest.xml (nonexistent) +++ ps/trunk/binaries/data/mods/mod/shaders/glsl/upscale_nearest.xml (revision 28010) @@ -0,0 +1,11 @@ + + + + + + + + + + + Property changes on: ps/trunk/binaries/data/mods/mod/shaders/glsl/upscale_nearest.xml ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: ps/trunk/binaries/data/mods/mod/shaders/program.rng =================================================================== --- ps/trunk/binaries/data/mods/mod/shaders/program.rng (revision 28009) +++ ps/trunk/binaries/data/mods/mod/shaders/program.rng (revision 28010) @@ -1,115 +1,124 @@ arb glsl - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + float vec2 vec3 vec4 mat2 mat3 mat4 sampler2D sampler2DShadow samplerCube pos normal color uv0 uv1 uv2 uv3 uv4 uv5 uv6 uv7 Index: ps/trunk/binaries/data/mods/public/gui/options/options.js =================================================================== --- 
ps/trunk/binaries/data/mods/public/gui/options/options.js (revision 28009) +++ ps/trunk/binaries/data/mods/public/gui/options/options.js (revision 28010) @@ -1,461 +1,465 @@ /** * Translated JSON file contents. */ var g_Options; /** * Names of config keys that have changed, value returned when closing the page. */ var g_ChangedKeys; /** * Vertical size of a tab button. */ var g_TabButtonHeight = 30; /** * Vertical space between two tab buttons. */ var g_TabButtonDist = 5; /** * Vertical distance between the top of the page and the first option. */ var g_OptionControlOffset = 5; /** * Vertical size of each option control. */ var g_OptionControlHeight = 26; /** * Vertical distance between two consecutive options. */ var g_OptionControlDist = 2; /** * Horizontal indentation to distinguish options that depend on another option. */ var g_DependentLabelIndentation = 25; /** * Color used to indicate that the string entered by the player isn't a sane color. */ var g_InsaneColor = "255 0 255"; /** * Defines the parsing of config strings and GUI control interaction for the different option types. * * @property configToValue - parses a string from the user config to a value of the declared type. * @property valueToGui - sets the GUI control to display the given value. * @property guiToValue - returns the value of the GUI control. * @property guiSetter - event name that should be considered a value change of the GUI control. * @property initGUI - sets properties of the GUI control that are independent of the current value. * @property sanitizeValue - Displays a visual clue if the entered value is invalid and returns a sane value. * @property tooltip - appends a custom tooltip to the given option description depending on the current value. 
*/ var g_OptionType = { "boolean": { "configToValue": config => config == "true", "valueToGui": (value, control) => { control.checked = value; }, "guiToValue": control => control.checked, "guiSetter": "onPress" }, "string": { "configToValue": value => value, "valueToGui": (value, control) => { control.caption = value; }, "guiToValue": control => control.caption, "guiSetter": "onTextEdit" }, "color": { "configToValue": value => value, "valueToGui": (value, control) => { control.caption = value; }, "initGUI": (option, control) => { control.children[2].onPress = () => { colorMixer( control.caption, (color) => { if (color != control.caption) { control.caption = color; control.onTextEdit(); } } ); }; }, "guiToValue": control => control.caption, "guiSetter": "onTextEdit", "sanitizeValue": (value, control, option) => { let color = guiToRgbColor(value); let sanitized = rgbToGuiColor(color); if (control) { control.sprite = sanitized == value ? "ModernDarkBoxWhite" : "ModernDarkBoxWhiteInvalid"; control.children[1].sprite = sanitized == value ? "color:" + value : "color:" + g_InsaneColor; } return sanitized; }, "tooltip": (value, option) => sprintf(translate("Default: %(value)s"), { "value": Engine.ConfigDB_GetValue("default", option.config) }) }, "number": { "configToValue": value => value, "valueToGui": (value, control) => { control.caption = value; }, "guiToValue": control => control.caption, "guiSetter": "onTextEdit", "sanitizeValue": (value, control, option) => { let sanitized = Math.min(option.max !== undefined ? option.max : +Infinity, Math.max(option.min !== undefined ? option.min : -Infinity, isNaN(+value) ? 0 : value)); if (control) control.sprite = sanitized == value ? "ModernDarkBoxWhite" : "ModernDarkBoxWhiteInvalid"; return sanitized; }, "tooltip": (value, option) => sprintf( option.min !== undefined && option.max !== undefined ? translateWithContext("option number", "Min: %(min)s, Max: %(max)s") : option.min !== undefined && option.max === undefined ? 
translateWithContext("option number", "Min: %(min)s") : option.min === undefined && option.max !== undefined ? translateWithContext("option number", "Max: %(max)s") : "", { "min": option.min, "max": option.max }) }, "dropdown": { "configToValue": value => value, "valueToGui": (value, control) => { control.selected = control.list_data.indexOf(value); }, "guiToValue": control => control.list_data[control.selected], "guiSetter": "onSelectionChange", "initGUI": (option, control) => { control.list = option.list.map(e => e.label); control.list_data = option.list.map(e => e.value); control.onHoverChange = () => { let item = option.list[control.hovered]; control.tooltip = item && item.tooltip || option.tooltip; }; } }, "dropdownNumber": { "configToValue": value => +value, "valueToGui": (value, control) => { control.selected = control.list_data.indexOf("" + value); }, "guiToValue": control => +control.list_data[control.selected], "guiSetter": "onSelectionChange", "initGUI": (option, control) => { control.list = option.list.map(e => e.label); control.list_data = option.list.map(e => e.value); control.onHoverChange = () => { const item = option.list[control.hovered]; control.tooltip = item && item.tooltip || option.tooltip; }; }, "timeout": (option, oldValue, hasChanges, newValue) => { if (!option.timeout) return; timedConfirmation( 500, 200, translate("Changes will be reverted in %(time)s seconds. 
Do you want to keep changes?"), "time", option.timeout, translate("Warning"), [translate("No"), translate("Yes")], [() => {this.revertChange(option, +oldValue, hasChanges);}, null] ); } }, "slider": { "configToValue": value => +value, "valueToGui": (value, control) => { control.value = +value; }, "guiToValue": control => control.value, "guiSetter": "onValueChange", "initGUI": (option, control) => { control.max_value = option.max; control.min_value = option.min; }, "tooltip": (value, option) => sprintf(translateWithContext("slider number", "Value: %(val)s (min: %(min)s, max: %(max)s)"), { "val": value.toFixed(2), "min": option.min.toFixed(2), "max": option.max.toFixed(2) }) } }; function init(data, hotloadData) { g_ChangedKeys = hotloadData ? hotloadData.changedKeys : new Set(); g_TabCategorySelected = hotloadData ? hotloadData.tabCategorySelected : 0; g_Options = Engine.ReadJSONFile("gui/options/options.json"); translateObjectKeys(g_Options, ["label", "tooltip"]); deepfreeze(g_Options); placeTabButtons( g_Options, false, g_TabButtonHeight, g_TabButtonDist, selectPanel, displayOptions); } function getHotloadData() { return { "tabCategorySelected": g_TabCategorySelected, "changedKeys": g_ChangedKeys }; } /** * Sets up labels and controls of all options of the currently selected category. 
*/ function displayOptions() { // Hide all controls for (let body of Engine.GetGUIObjectByName("option_controls").children) { body.hidden = true; for (let control of body.children) control.hidden = true; } // Initialize label and control of each option for this category for (let i = 0; i < g_Options[g_TabCategorySelected].options.length; ++i) { // Position vertically let body = Engine.GetGUIObjectByName("option_control[" + i + "]"); let bodySize = body.size; bodySize.top = g_OptionControlOffset + i * (g_OptionControlHeight + g_OptionControlDist); bodySize.bottom = bodySize.top + g_OptionControlHeight; body.size = bodySize; body.hidden = false; // Load option data let option = g_Options[g_TabCategorySelected].options[i]; let optionType = g_OptionType[option.type]; let value = optionType.configToValue(Engine.ConfigDB_GetValue("user", option.config)); // Setup control let control = Engine.GetGUIObjectByName("option_control_" + option.type + "[" + i + "]"); control.tooltip = option.tooltip + (optionType.tooltip ? "\n" + optionType.tooltip(value, option) : ""); control.hidden = false; if (optionType.initGUI) optionType.initGUI(option, control); control[optionType.guiSetter] = function() {}; optionType.valueToGui(value, control); if (optionType.sanitizeValue) optionType.sanitizeValue(value, control, option); control[optionType.guiSetter] = function() { let value = optionType.guiToValue(control); if (optionType.sanitizeValue) optionType.sanitizeValue(value, control, option); const oldValue = optionType.configToValue(Engine.ConfigDB_GetValue("user", option.config)); control.tooltip = option.tooltip + (optionType.tooltip ? 
"\n" + optionType.tooltip(value, option) : ""); const hasChanges = Engine.ConfigDB_HasChanges("user"); Engine.ConfigDB_CreateValue("user", option.config, String(value)); g_ChangedKeys.add(option.config); fireConfigChangeHandlers(new Set([option.config])); if (option.timeout) optionType.timeout(option, oldValue, hasChanges, value); if (option.function) Engine[option.function](value); enableButtons(); }; // Setup label let label = Engine.GetGUIObjectByName("option_label[" + i + "]"); label.caption = option.label; label.tooltip = option.tooltip; label.hidden = false; let labelSize = label.size; labelSize.left = option.dependencies ? g_DependentLabelIndentation : 0; labelSize.rright = control.size.rleft; label.size = labelSize; } enableButtons(); } /** * Enable exactly the buttons whose dependencies are met. */ function enableButtons() { g_Options[g_TabCategorySelected].options.forEach((option, i) => { const isDependencyMet = dependency => { if (typeof dependency === "string") return Engine.ConfigDB_GetValue("user", dependency) == "true"; else if (typeof dependency === "object") { const availableOps = { "==": (config, value) => config == value, - "!=": (config, value) => config != value + "!=": (config, value) => config != value, + "<": (config, value) => +config < +value, + "<=": (config, value) => +config <= +value, + ">": (config, value) => +config > +value, + ">=": (config, value) => +config >= +value }; const op = availableOps[dependency.op] || availableOps["=="]; return op(Engine.ConfigDB_GetValue("user", dependency.config), dependency.value); } error("Unsupported dependency: " + uneval(dependency)); return false; }; const enabled = !option.dependencies || option.dependencies.every(isDependencyMet); Engine.GetGUIObjectByName("option_label[" + i + "]").enabled = enabled; Engine.GetGUIObjectByName("option_control_" + option.type + "[" + i + "]").enabled = enabled; }); const hasChanges = Engine.ConfigDB_HasChanges("user"); 
Engine.GetGUIObjectByName("revertChanges").enabled = hasChanges; Engine.GetGUIObjectByName("saveChanges").enabled = hasChanges; } function setDefaults() { messageBox( 500, 200, translate("Resetting the options will erase your saved settings. Do you want to continue?"), translate("Warning"), [translate("No"), translate("Yes")], [null, reallySetDefaults] ); } function reallySetDefaults() { for (let category in g_Options) for (let option of g_Options[category].options) { Engine.ConfigDB_RemoveValue("user", option.config); g_ChangedKeys.add(option.config); } Engine.ConfigDB_SaveChanges("user"); revertChanges(); } function revertChange(option, oldValue, hadChanges) { Engine.ConfigDB_CreateValue("user", option.config, String(oldValue)); if (!hadChanges) Engine.ConfigDB_SetChanges("user", false); if (option.function) Engine[option.function](oldValue); displayOptions(); } function revertChanges() { Engine.ConfigDB_Reload("user"); for (let category in g_Options) for (let option of g_Options[category].options) if (option.function) Engine[option.function]( g_OptionType[option.type].configToValue( Engine.ConfigDB_GetValue("user", option.config))); displayOptions(); } function saveChanges() { for (let category in g_Options) for (let i = 0; i < g_Options[category].options.length; ++i) { let option = g_Options[category].options[i]; let optionType = g_OptionType[option.type]; if (!optionType.sanitizeValue) continue; let value = optionType.configToValue(Engine.ConfigDB_GetValue("user", option.config)); if (value == optionType.sanitizeValue(value, undefined, option)) continue; selectPanel(category); messageBox( 500, 200, translate("Some setting values are invalid! Are you sure you want to save them?"), translate("Warning"), [translate("No"), translate("Yes")], [null, reallySaveChanges] ); return; } reallySaveChanges(); } function reallySaveChanges() { Engine.ConfigDB_SaveChanges("user"); enableButtons(); } /** * Close GUI page and inform the parent GUI page which options changed. 
**/ function closePage() { if (Engine.ConfigDB_HasChanges("user")) messageBox( 500, 200, translate("You have unsaved changes, do you want to close this window?"), translate("Warning"), [translate("No"), translate("Yes")], [null, closePageWithoutConfirmation]); else closePageWithoutConfirmation(); } function closePageWithoutConfirmation() { Engine.PopGuiPage(g_ChangedKeys); } Index: ps/trunk/binaries/data/mods/public/gui/options/options.json =================================================================== --- ps/trunk/binaries/data/mods/public/gui/options/options.json (revision 28009) +++ ps/trunk/binaries/data/mods/public/gui/options/options.json (revision 28010) @@ -1,800 +1,829 @@ [ { "label": "General", "options": [ { "type": "string", "label": "Player name (single-player)", "tooltip": "How you want to be addressed in single-player matches.", "config": "playername.singleplayer" }, { "type": "string", "label": "Player name (multiplayer)", "tooltip": "How you want to be addressed in multiplayer matches (except lobby).", "config": "playername.multiplayer" }, { "type": "boolean", "label": "Background pause", "tooltip": "Pause single-player games when window loses focus.", "config": "pauseonfocusloss", "function": "PauseOnFocusLoss" }, { "type": "boolean", "label": "Enable welcome screen", "tooltip": "If you disable it, the welcome screen will still appear once, each time a new version is available. 
You can always launch it from the main menu.", "config": "gui.splashscreen.enable" }, { "type": "boolean", "label": "FPS overlay", "tooltip": "Show frames per second in top right corner.", "config": "overlay.fps" }, { "type": "boolean", "label": "Real time overlay", "tooltip": "Show current system time in top right corner.", "config": "overlay.realtime" }, { "type": "boolean", "label": "Game time overlay", "tooltip": "Show current simulation time in top right corner.", "config": "gui.session.timeelapsedcounter" }, { "type": "boolean", "label": "Ceasefire time overlay", "tooltip": "Always show the remaining ceasefire time.", "config": "gui.session.ceasefirecounter" }, { "type": "boolean", "label": "Chat timestamp", "tooltip": "Display the time at which a chat message was posted.", "config": "chat.timestamp" }, { "type": "dropdown", "label": "Naming of entities.", "tooltip": "How to show entity names.", "config": "gui.session.howtoshownames", "list": [ { "value": 0, "label": "Specific names first", "tooltip": "Display specific names before generic names." }, { "value": 1, "label": "Generic names first", "tooltip": "Display generic names before specific names." }, { "value": 2, "label": "Only specific names", "tooltip": "Display only specific names for entities." }, { "value": 3, "label": "Only generic names", "tooltip": "Display only generic names for entities." } ] } ] }, { "label": "Graphics (general)", "tooltip": "Set the balance between performance and visual appearance.", "options": [ { "type": "boolean", "label": "Windowed mode", "tooltip": "Start 0 A.D. 
in a window.", "config": "windowed" }, { "type": "boolean", "label": "Fog", "tooltip": "Enable fog.", "config": "fog" }, { "type": "boolean", "label": "Post-processing", "tooltip": "Use screen-space post-processing filters (HDR, Bloom, DOF, etc).", "config": "postproc" }, { + "type": "dropdownNumber", + "label": "Resolution scale", + "tooltip": "A smaller scale makes rendering faster but produces a more blurry picture, a large scale makes rendering slower but produces a better picture.", + "dependencies": ["postproc"], + "config": "renderer.scale", + "list": [ + { "value": 0.5, "label": "50%" }, + { "value": 0.75, "label": "75%" }, + { "value": 0.875, "label": "87.5%" }, + { "value": 1.00, "label": "100%" }, + { "value": 1.25, "label": "125%" }, + { "value": 1.50, "label": "150%" }, + { "value": 1.75, "label": "175%" }, + { "value": 2.00, "label": "200%" } + ] + }, + { + "type": "dropdown", + "label": "Upscale technique", + "tooltip": "Technique defines performance and quality of upscaling process.", + "dependencies": ["postproc", { "config": "renderer.scale", "op": "<", "value": 1.0 }], + "config": "renderer.upscale.technique", + "list": [ + { "value": "fsr", "label": "FidelityFX Super Resolution 1.0", "tooltip": "Advanced upscale technique. For better results, use FSR with antialiasing enabled. Using it with the OpenGL backend may have some issues, consider using Vulkan backend instead." }, + { "value": "bilinear", "label": "Bilinear", "tooltip": "Bilinear upscale technique. Produces a slightly blurry picture depending on the scale." }, + { "value": "pixelated", "label": "Pixelated", "tooltip": "Simplest upscale technique. Used mostly for stylized effect." 
} + ] + }, + { "type": "boolean", "label": "Shadows", "tooltip": "Enable shadows.", "config": "shadows" }, { "type": "boolean", "label": "Unit silhouettes", "tooltip": "Show outlines of units behind structures.", "config": "silhouettes" }, { "type": "boolean", "label": "Particles", "tooltip": "Enable particles.", "config": "particles" }, { "type": "boolean", "label": "VSync", "tooltip": "Run vertical sync to fix screen tearing. REQUIRES GAME RESTART", "config": "vsync" }, { "type": "slider", "label": "FPS throttling in menus", "tooltip": "To save CPU workload, throttle render frequency in all menus. Set to maximum to disable throttling.", "config": "adaptivefps.menu", "min": 20, "max": 360 }, { "type": "slider", "label": "FPS throttling in games", "tooltip": "To save CPU workload, throttle render frequency in running games. Set to maximum to disable throttling.", "config": "adaptivefps.session", "min": 20, "max": 360 }, { "type": "dropdownNumber", "label": "GUI scale", "timeout": 5000, "tooltip": "GUI scale", "config": "gui.scale", "function": "SetGUIScale", "list": [ { "value": 0.75, "label": "75%" }, { "value": 1.00, "label": "100%" }, { "value": 1.25, "label": "125%" }, { "value": 1.50, "label": "150%" }, { "value": 1.75, "label": "175%" }, { "value": 2.00, "label": "200%" }, { "value": 2.25, "label": "225%" }, { "value": 2.50, "label": "250%" } ] }, { "type": "number", "label": "Mouse drag", "tooltip": "Number of pixels the mouse can move before the action is considered a drag.", "config": "gui.session.dragdelta", "min": "1", "max": "200" }, { "type": "boolean", "label": "Mouse grab in fullscreen", "tooltip": "Constrain mouse in the fullscreen mode to the window boundaries. 
It's used to avoid mouse going out of a display in case of multiple displays.", "config": "window.mousegrabinfullscreen" }, { "type": "boolean", "label": "Mouse grab in window mode", "tooltip": "Constrain mouse in the window mode to the window boundaries.", "config": "window.mousegrabinwindowmode" } ] }, { "label": "Graphics (advanced)", "tooltip": "More specific rendering settings.", "options": [ { "type": "dropdown", "label": "Renderer backend", "tooltip": "Choose the renderer's backend. REQUIRES GAME RESTART", "config": "rendererbackend", "list": [ { "value": "gl", "label": "OpenGL", "tooltip": "Default OpenGL backend with GLSL. REQUIRES GAME RESTART" }, { "value": "glarb", "label": "OpenGL ARB", "tooltip": "Legacy OpenGL backend with ARB shaders. REQUIRES GAME RESTART" }, { "value": "vulkan", "label": "Vulkan", "tooltip": "Modern API, requires up-to-date drivers. REQUIRES GAME RESTART" } ] }, { "type": "boolean", "label": "Fog", "tooltip": "Enable fog.", "dependencies": [{ "config": "rendererbackend", "op": "!=", "value": "glarb" }], "config": "fog" }, { "type": "boolean", "label": "Post-processing", "tooltip": "Use screen-space post-processing filters (HDR, Bloom, DOF, etc).", "config": "postproc" }, { "type": "dropdown", "label": "Antialiasing", "tooltip": "Reduce aliasing effect on edges.", "dependencies": ["postproc", { "config": "rendererbackend", "op": "!=", "value": "glarb" }], "config": "antialiasing", "list": [ { "value": "disabled", "label": "Disabled", "tooltip": "Do not use antialiasing." }, { "value": "fxaa", "label": "FXAA", "tooltip": "Fast, but simple antialiasing." }, { "value": "msaa2", "label": "MSAA (2×)", "tooltip": "Slow, but high-quality antialiasing, uses two samples per pixel. Supported for GL3.3+." }, { "value": "msaa4", "label": "MSAA (4×)", "tooltip": "Slow, but high-quality antialiasing, uses four samples per pixel. Supported for GL3.3+." 
}, { "value": "msaa8", "label": "MSAA (8×)", "tooltip": "Slow, but high-quality antialiasing, uses eight samples per pixel. Supported for GL3.3+." }, { "value": "msaa16", "label": "MSAA (16×)", "tooltip": "Slow, but high-quality antialiasing, uses sixteen samples per pixel. Supported for GL3.3+." } ] }, { "type": "dropdown", "label": "Sharpening", "tooltip": "Reduce blurry effects.", "dependencies": ["postproc", { "config": "rendererbackend", "op": "!=", "value": "glarb" }], "config": "sharpening", "list": [ { "value": "disabled", "label": "Disabled", "tooltip": "Do not use sharpening." }, { "value": "cas", "label": "FidelityFX CAS", "tooltip": "Contrast adaptive sharpening, a fast, contrast based sharpening pass." } ] }, { "type": "slider", "label": "Sharpness factor", "tooltip": "The sharpness of the chosen pass.", "dependencies": [ "postproc", { "config": "rendererbackend", "op": "!=", "value": "glarb" }, { "config": "sharpening", "op": "!=", "value": "disabled" } ], "config": "sharpness", "min": 0, "max": 1 }, { "type": "dropdown", "label": "Model quality", "tooltip": "Model quality setting.", "config": "max_actor_quality", "list": [ { "value": 100, "label": { "_string": "Low", "context": "Option for the meshes' level of detail." }, "tooltip": "Simpler models for better performance." }, { "value": 150, "label": { "_string": "Medium", "context": "Option for the meshes' level of detail." }, "tooltip": "Average quality and average performance." }, { "value": 200, "label": { "_string": "High", "context": "Option for the meshes' level of detail." }, "tooltip": "High quality models." } ] }, { "type": "dropdown", "label": "Model appearance randomization", "tooltip": "Randomize the appearance of entities. Disabling gives a small performance improvement.", "config": "variant_diversity", "list": [ { "value": "none", "label": { "_string": "None", "context": "Option for the meshes' amount of variety." }, "tooltip": "Entities will all look the same." 
}, { "value": "limited", "label": { "_string": "Limited", "context": "Option for the meshes' amount of variety." }, "tooltip": "Entities will be less diverse." }, { "value": "full", "label": { "_string": "Normal", "context": "Option for the meshes' amount of variety." }, "tooltip": "Entities appearance is randomized normally." } ] }, { "type": "slider", "label": "Shader effects", "tooltip": "Number of shader effects. REQUIRES GAME RESTART", "config": "materialmgr.quality", "min": 0, "max": 10 }, { "type": "boolean", "label": "Shadows", "tooltip": "Enable shadows.", "config": "shadows" }, { "type": "dropdown", "label": "Quality", "tooltip": "Shadow map resolution. High values can crash the game when using a graphics card with low memory!", "dependencies": ["shadows"], "config": "shadowquality", "list": [ { "value": -1, "label": { "_string": "Low", "context": "Option for the shadow quality." } }, { "value": 0, "label": { "_string": "Medium", "context": "Option for the shadow quality." } }, { "value": 1, "label": { "_string": "High", "context": "Option for the shadow quality." } }, { "value": 2, "label": { "_string": "Very High", "context": "Option for the shadow quality." } } ] }, { "type": "boolean", "label": "Filtering", "tooltip": "Smooth shadows.", "dependencies": ["shadows"], "config": "shadowpcf" }, { "type": "slider", "label": "Cutoff distance", "tooltip": "Hides shadows beyond a certain distance from a camera.", "dependencies": ["shadows"], "config": "shadowscutoffdistance", "min": 100, "max": 1500 }, { "type": "boolean", "label": "Cover whole map", "tooltip": "When ON shadows cover the whole map and shadows cutoff distance is ignored. Useful for making screenshots of a whole map.", "dependencies": ["shadows"], "config": "shadowscovermap" }, { "type": "boolean", "label": "Water effects", "tooltip": "When OFF, use the lowest settings possible to render water. 
This makes other settings irrelevant.", "config": "watereffects" }, { "type": "boolean", "label": "High-quality water effects", "tooltip": "Use higher-quality effects for water, rendering coastal waves, shore foam, and ships trails.", "dependencies": ["watereffects"], "config": "waterfancyeffects" }, { "type": "boolean", "label": "Water reflections", "tooltip": "Allow water to reflect a mirror image.", "dependencies": ["watereffects"], "config": "waterreflection" }, { "type": "boolean", "label": "Water refraction", "tooltip": "Use a real water refraction map and not transparency.", "dependencies": ["watereffects"], "config": "waterrefraction" }, { "type": "boolean", "label": "Real water depth", "tooltip": "Use actual water depth in rendering calculations.", "dependencies": ["watereffects", "waterrefraction"], "config": "waterrealdepth" }, { "type": "dropdown", "label": "Texture quality", "tooltip": "Decrease texture quality making them blurrier but increases game performance.", "config": "textures.quality", "list": [ { "value": 0, "label": { "_string": "Low", "context": "Option for the texture quality." }, "tooltip": "Low" }, { "value": 1, "label": { "_string": "Medium", "context": "Option for the texture quality." }, "tooltip": "Medium" }, { "value": 2, "label": { "_string": "High", "context": "Option for the texture quality." }, "tooltip": "High" } ] }, { "type": "dropdown", "label": "Texture anisotropic filter", "tooltip": "Makes textures look better, especially terrain. 
If the anisotropic filter value is unsupported it will be set to the max supported value.", "config": "textures.maxanisotropy", "list": [ { "value": 1, "label": "1x", "tooltip": "Disabled" }, { "value": 2, "label": "2x", "tooltip": "2x" }, { "value": 4, "label": "4x", "tooltip": "4x" }, { "value": 8, "label": "8x", "tooltip": "8x" }, { "value": 16, "label": "16x", "tooltip": "16x" } ] } ] }, { "label": "Sound", "options": [ { "type": "slider", "label": "Master volume", "tooltip": "Master audio gain.", "config": "sound.mastergain", "function": "SetMasterGain", "min": 0, "max": 2 }, { "type": "slider", "label": "Music volume", "tooltip": "In game music gain.", "config": "sound.musicgain", "function": "SetMusicGain", "min": 0, "max": 2 }, { "type": "slider", "label": "Ambient volume", "tooltip": "In game ambient sound gain.", "config": "sound.ambientgain", "function": "SetAmbientGain", "min": 0, "max": 2 }, { "type": "slider", "label": "Action volume", "tooltip": "In game unit action sound gain.", "config": "sound.actiongain", "function": "SetActionGain", "min": 0, "max": 2 }, { "type": "slider", "label": "UI volume", "tooltip": "UI sound gain.", "config": "sound.uigain", "function": "SetUIGain", "min": 0, "max": 2 }, { "type": "boolean", "label": "Nick notification", "tooltip": "Receive audio notification when someone types your nick.", "config": "sound.notify.nick" }, { "type": "boolean", "label": "New player notification in game setup", "tooltip": "Receive audio notification when a new client joins the game setup.", "config": "sound.notify.gamesetup.join" } ] }, { "label": "Game Setup", "options": [ { "type": "boolean", "label": "Enable game setting tips", "tooltip": "Show tips when setting up a game.", "config": "gui.gamesetup.enabletips" }, { "type": "boolean", "label": "Enable settings panel slide", "tooltip": "Slide the settings panel when opening, closing or resizing.", "config": "gui.gamesetup.settingsslide" }, { "type": "boolean", "label": "Persist match 
settings", "tooltip": "Save and restore match settings for quick reuse when hosting another game.", "config": "persistmatchsettings" }, { "type": "dropdown", "label": "Default AI difficulty", "tooltip": "Default difficulty of the AI.", "config": "gui.gamesetup.aidifficulty", "list": [ { "value": 0, "label": { "_string": "Sandbox", "context": "Option for the AI difficulty." }}, { "value": 1, "label": { "_string": "Very Easy", "context": "Option for the AI difficulty." }}, { "value": 2, "label": { "_string": "Easy", "context": "Option for the AI difficulty." }}, { "value": 3, "label": { "_string": "Medium", "context": "Option for the AI difficulty." }}, { "value": 4, "label": { "_string": "Hard", "context": "Option for the AI difficulty." }}, { "value": 5, "label": { "_string": "Very Hard", "context": "Option for the AI difficulty." }} ] }, { "type": "dropdown", "label": "Default AI behavior", "tooltip": "Default behavior of the AI.", "config": "gui.gamesetup.aibehavior", "list": [ { "value": "random", "label": "Random" }, { "value": "balanced", "label": "Balanced" }, { "value": "aggressive", "label": "Aggressive" }, { "value": "defensive", "label": "Defensive" } ] }, { "type": "dropdown", "label": "Assign players", "tooltip": "Automatically assign joining clients to free player slots during the match setup.", "config": "gui.gamesetup.assignplayers", "list": [ { "value": "everyone", "label": "Everyone", "tooltip": "Players joining the match will be assigned if there is a free slot." }, { "value": "buddies", "label": "Buddies", "tooltip": "Players joining the match will only be assigned if they are a buddy of the host and if there is a free slot." }, { "value": "disabled", "label": "Disabled", "tooltip": "Players only receive a slot when the host assigns them explicitly." 
} ] } ] }, { "label": "Networking / Lobby", "tooltip": "These settings only affect the multiplayer.", "options": [ { "type": "boolean", "label": "TLS encryption", "tooltip": "Protect login and data exchanged with the lobby server using TLS encryption.", "config": "lobby.tls" }, { "type": "number", "label": "Chat backlog", "tooltip": "Number of backlogged messages to load when joining the lobby.", "config": "lobby.history", "min": "0" }, { "type": "boolean", "label": "Game rating column", "tooltip": "Show the average rating of the participating players in a column of the gamelist.", "config": "lobby.columns.gamerating" }, { "type": "boolean", "label": "Network warnings", "tooltip": "Show which player has a bad connection in multiplayer games.", "config": "overlay.netwarnings" }, { "type": "dropdown", "label": "Late observer joins", "tooltip": "Allow everybody or buddies only to join the game as observer after it started.", "config": "network.lateobservers", "list": [ { "value": "everyone", "label": "Everyone" }, { "value": "buddies", "label": "Buddies" }, { "value": "disabled", "label": "Disabled" } ] }, { "type": "number", "label": "Observer limit", "tooltip": "Prevent further observers from joining if the limit is reached.", "config": "network.observerlimit", "min": 0, "max": 32 }, { "type": "number", "label": "Max lag for observers", "tooltip": "When hosting, pause the game if observers are lagging more than this many turns. 
If set to -1, observers are ignored.", "config": "network.observermaxlag", "min": -1, "max": 10000 }, { "type": "boolean", "label": "(Observer) Speed up when lagging.", "tooltip": "When observing a game, automatically speed up if you start lagging, to catch up with the live match.", "config": "network.autocatchup" } ] }, { "label": "Game Session", "tooltip": "Change options regarding the in-game settings.", "options": [ { "type": "slider", "label": "Wounded unit health", "tooltip": "The wounded unit hotkey considers the selected units as wounded if their health percentage falls below this number.", "config": "gui.session.woundedunithotkeythreshold", "min": 0, "max": 100 }, { "type": "number", "label": "Batch training size", "tooltip": "Number of units trained per batch by default.", "config": "gui.session.batchtrainingsize", "min": 1, "max": 20 }, { "type": "slider", "label": "Scroll batch increment ratio", "tooltip": "Number of times you have to scroll to increase/decrease the batchsize by 1.", "config": "gui.session.scrollbatchratio", "min": 0.1, "max": 30 }, { "type": "slider", "label": "Flare display duration", "tooltip": "How long the flare markers on the minimap are displayed in seconds.", "config": "gui.session.flarelifetime", "min": 0, "max": 60 }, { "type": "boolean", "label": "Minimap icons", "tooltip": "Show special icons for some entities on the minimap.", "config": "gui.session.minimap.icons.enabled" }, { "type": "boolean", "label": "Chat notification attack", "tooltip": "Show a chat notification if you are attacked by another player.", "config": "gui.session.notifications.attack" }, { "type": "boolean", "label": "Chat notification tribute", "tooltip": "Show a chat notification if an ally tributes resources to another team member if teams are locked, and all tributes in observer mode.", "config": "gui.session.notifications.tribute" }, { "type": "boolean", "label": "Chat notification barter", "tooltip": "Show a chat notification to observers when a 
player bartered resources.", "config": "gui.session.notifications.barter" }, { "type": "dropdown", "label": "Chat notification phase", "tooltip": "Show a chat notification if you or an ally have started, aborted or completed a new phase, and phases of all players in observer mode.", "config": "gui.session.notifications.phase", "list": [ { "value": "none", "label": "Disable" }, { "value": "completed", "label": "Completed" }, { "value": "all", "label": "All displayed" } ] }, { "type": "boolean", "label": "Attack range visualization", "tooltip": "Display the attack range of selected defensive structures. (It can also be toggled with the hotkey during a game).", "config": "gui.session.attackrange" }, { "type": "boolean", "label": "Aura range visualization", "tooltip": "Display the range of auras of selected units and structures. (It can also be toggled with the hotkey during a game).", "config": "gui.session.aurasrange" }, { "type": "boolean", "label": "Heal range visualization", "tooltip": "Display the healing range of selected units. 
(It can also be toggled with the hotkey during a game).", "config": "gui.session.healrange" }, { "type": "boolean", "label": "Rank icon above status bar", "tooltip": "Show rank icons above status bars.", "config": "gui.session.rankabovestatusbar" }, { "type": "boolean", "label": "Experience status bar", "tooltip": "Show an experience status bar above each selected unit.", "config": "gui.session.experiencestatusbar" }, { "type": "boolean", "label": "Detailed tooltips", "tooltip": "Show detailed tooltips for trainable units in unit-producing structures.", "config": "showdetailedtooltips" }, { "type": "dropdown", "label": "Sort resources and population tooltip", "tooltip": "Dynamically sort players in the resources and population tooltip by value.", "config": "gui.session.respoptooltipsort", "list": [ { "value": 0, "label": "Unordered" }, { "value": -1, "label": "Ascending" }, { "value": 1, "label": "Descending" } ] }, { "type": "color", "label": "Diplomacy colors: self", "tooltip": "Color of your units when diplomacy colors are enabled.", "config": "gui.session.diplomacycolors.self" }, { "type": "color", "label": "Diplomacy colors: ally", "tooltip": "Color of allies when diplomacy colors are enabled.", "config": "gui.session.diplomacycolors.ally" }, { "type": "color", "label": "Diplomacy colors: neutral", "tooltip": "Color of neutral players when diplomacy colors are enabled.", "config": "gui.session.diplomacycolors.neutral" }, { "type": "color", "label": "Diplomacy colors: enemy", "tooltip": "Color of enemies when diplomacy colors are enabled.", "config": "gui.session.diplomacycolors.enemy" }, { "type": "dropdown", "label": "Snap to edges", "tooltip": "This option allows to align new structures with nearby structures.", "config": "gui.session.snaptoedges", "list": [ { "value": "disabled", "label": "Hotkey to enable snapping", "tooltip": "New structures are aligned with nearby structures while pressing the hotkey." 
}, { "value": "enabled", "label": "Hotkey to disable snapping", "tooltip": "New structures are aligned with nearby structures unless the hotkey is pressed." } ] }, { "type": "dropdown", "label": "Control group membership", "tooltip": "Decide whether units can be part of multiple control groups.", "config": "gui.session.disjointcontrolgroups", "list": [ { "value": "true", "label": "Single", "tooltip": "When adding a Unit or Structure to a control group, they are removed from other control groups. Use this choice if you want control groups to refer to distinct armies." }, { "value": "false", "label": "Multiple", "tooltip": "Units and Structures can be part of multiple control groups. This is useful to keep control groups for distinct armies and a control group for the entire army simultaneously." } ] }, { "type": "dropdown", "label": "Formation control", "tooltip": "Decide whether formations are enabled for all orders or only 'Walk' and 'Patrol'.", "config": "gui.session.formationwalkonly", "list": [ { "value": "true", "label": "Walk/Patrol Only", "tooltip": "Other orders will disband existing formations." }, { "value": "false", "label": "No override", "tooltip": "Units in formations stay in formations." } ] }, { "type": "boolean", "label": "Battalion-style formations", "tooltip": "Whether formations are selected as a whole.", "config": "gui.session.selectformationasone" } ] } ] Index: ps/trunk/source/graphics/ShaderManager.h =================================================================== --- ps/trunk/source/graphics/ShaderManager.h (revision 28009) +++ ps/trunk/source/graphics/ShaderManager.h (revision 28010) @@ -1,147 +1,148 @@ -/* Copyright (C) 2023 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. 
is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . */ #ifndef INCLUDED_SHADERMANAGER #define INCLUDED_SHADERMANAGER #include "graphics/ShaderDefines.h" #include "graphics/ShaderProgram.h" #include "graphics/ShaderTechnique.h" +#include "renderer/backend/IDevice.h" #include "renderer/backend/PipelineState.h" #include #include #include #include /** * Shader manager: loads and caches shader programs. * * For a high-level overview of shaders and materials, see * http://trac.wildfiregames.com/wiki/MaterialSystem */ class CShaderManager { public: CShaderManager(Renderer::Backend::IDevice* device); ~CShaderManager(); /** * Load a shader effect. * Effects can be implemented via many techniques; this returns the best usable technique. * @param name name of effect XML specification (file is loaded from shaders/effects/${name}.xml) * @param defines key/value set of preprocessor definitions * @return loaded technique, or empty technique on error */ CShaderTechniquePtr LoadEffect(CStrIntern name, const CShaderDefines& defines); /** * Load a shader effect, with empty defines. */ CShaderTechniquePtr LoadEffect(CStrIntern name); /** * Load a shader effect with the pipeline state description overwriting. * TODO: we should set all needed states in XML. 
*/ using PipelineStateDescCallback = CShaderTechnique::PipelineStateDescCallback; CShaderTechniquePtr LoadEffect( CStrIntern name, const CShaderDefines& defines, const PipelineStateDescCallback& callback); /** * Returns the number of shader effects that are currently loaded. */ size_t GetNumEffectsLoaded() const; private: struct CacheKey { std::string name; CShaderDefines defines; bool operator<(const CacheKey& k) const { if (name < k.name) return true; if (k.name < name) return false; return defines < k.defines; } }; Renderer::Backend::IDevice* m_Device = nullptr; // A CShaderProgram contains expensive backend state, so we ought to cache it. // The compiled state depends solely on the filename and list of defines, // so we store that in CacheKey. // TODO: is this cache useful when we already have an effect cache? std::map m_ProgramCache; /** * Key for effect cache lookups. * This stores two separate CShaderDefines because the renderer typically * has one set from the rendering context and one set from the material; * by handling both separately here, we avoid the cost of having to merge * the two sets into a single one before doing the cache lookup. */ struct EffectCacheKey { CStrIntern name; CShaderDefines defines; bool operator==(const EffectCacheKey& b) const; }; struct EffectCacheKeyHash { size_t operator()(const EffectCacheKey& key) const; }; using EffectCacheMap = std::unordered_map; EffectCacheMap m_EffectCache; // Store the set of shaders that need to be reloaded when the given file is modified template using HotloadFilesMap = std::unordered_map< VfsPath, std::set, std::owner_less>>>; HotloadFilesMap m_HotloadTechniques; HotloadFilesMap m_HotloadPrograms; /** * Load a shader program. 
* @param name name of shader XML specification (file is loaded from shaders/${name}.xml) * @param defines key/value set of preprocessor definitions * @return loaded program, or null pointer on error */ CShaderProgramPtr LoadProgram(const CStr& name, const CShaderDefines& defines); bool LoadTechnique(CShaderTechniquePtr& tech); static Status ReloadChangedFileCB(void* param, const VfsPath& path); Status ReloadChangedFile(const VfsPath& path); /** * Associates the file with the technique to be reloaded if the file has changed. */ void AddTechniqueFileDependency(const CShaderTechniquePtr& technique, const VfsPath& path); /** * Associates the file with the program to be reloaded if the file has changed. */ void AddProgramFileDependency(const CShaderProgramPtr& program, const VfsPath& path); }; #endif // INCLUDED_SHADERMANAGER Index: ps/trunk/source/graphics/ShaderTechnique.h =================================================================== --- ps/trunk/source/graphics/ShaderTechnique.h (revision 28009) +++ ps/trunk/source/graphics/ShaderTechnique.h (revision 28010) @@ -1,102 +1,111 @@ -/* Copyright (C) 2022 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . 
*/ #ifndef INCLUDED_SHADERTECHNIQUE #define INCLUDED_SHADERTECHNIQUE #include "graphics/ShaderDefines.h" #include "graphics/ShaderProgram.h" #include "graphics/ShaderTechniquePtr.h" #include "lib/code_annotation.h" #include "lib/file/vfs/vfs_path.h" #include "renderer/backend/PipelineState.h" #include #include #include /** * Implements a render pass consisting of a pipeline state and a shader, * used by CShaderTechnique. */ class CShaderPass { public: CShaderPass( std::unique_ptr pipelineState, const CShaderProgramPtr& shader); MOVABLE(CShaderPass); const CShaderProgramPtr& GetShaderProgram() const noexcept { return m_Shader; } Renderer::Backend::IGraphicsPipelineState* GetPipelineState() const noexcept { return m_PipelineState.get(); } private: CShaderProgramPtr m_Shader; std::unique_ptr m_PipelineState; }; /** * Implements a render technique consisting of a sequence of passes. * CShaderManager loads these from shader effect XML files. */ class CShaderTechnique { public: using PipelineStateDescCallback = std::function; CShaderTechnique(const VfsPath& path, const CShaderDefines& defines, const PipelineStateDescCallback& callback); void SetPasses(std::vector&& passes); + void SetComputePipelineState( + std::unique_ptr pipelineState, + const CShaderProgramPtr& computeShader); int GetNumPasses() const; Renderer::Backend::IShaderProgram* GetShader(int pass = 0) const; Renderer::Backend::IGraphicsPipelineState* GetGraphicsPipelineState(int pass = 0) const; + Renderer::Backend::IComputePipelineState* + GetComputePipelineState() const; + /** * Whether this technique uses alpha blending that requires objects to be * drawn from furthest to nearest. 
*/ bool GetSortByDistance() const; void SetSortByDistance(bool enable); const VfsPath& GetPath() { return m_Path; } const CShaderDefines& GetShaderDefines() { return m_Defines; } const PipelineStateDescCallback& GetPipelineStateDescCallback() const { return m_PipelineStateDescCallback; }; private: std::vector m_Passes; bool m_SortByDistance = false; // We need additional data to reload the technique. VfsPath m_Path; CShaderDefines m_Defines; PipelineStateDescCallback m_PipelineStateDescCallback; + + std::unique_ptr m_ComputePipelineState; + CShaderProgramPtr m_ComputeShader; }; #endif // INCLUDED_SHADERTECHNIQUE Index: ps/trunk/source/renderer/PostprocManager.cpp =================================================================== --- ps/trunk/source/renderer/PostprocManager.cpp (revision 28009) +++ ps/trunk/source/renderer/PostprocManager.cpp (revision 28010) @@ -1,697 +1,959 @@ -/* Copyright (C) 2023 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . 
*/ #include "precompiled.h" #include "renderer/PostprocManager.h" #include "graphics/GameView.h" #include "graphics/LightEnv.h" #include "graphics/ShaderManager.h" #include "lib/bits.h" #include "maths/MathUtil.h" #include "ps/ConfigDB.h" #include "ps/CLogger.h" #include "ps/CStrInternStatic.h" #include "ps/Filesystem.h" #include "ps/Game.h" #include "ps/World.h" #include "renderer/backend/IDevice.h" #include "renderer/Renderer.h" #include "renderer/RenderingOptions.h" #include "tools/atlas/GameInterface/GameLoop.h" #include namespace { void DrawFullscreenQuad( Renderer::Backend::IVertexInputLayout* vertexInputLayout, Renderer::Backend::IDeviceCommandContext* deviceCommandContext) { float quadVerts[] = { 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f }; const bool flip = deviceCommandContext->GetDevice()->GetBackend() == Renderer::Backend::Backend::VULKAN; const float bottomV = flip ? 1.0 : 0.0f; const float topV = flip ? 0.0f : 1.0f; float quadTex[] = { 1.0f, topV, 0.0f, topV, 0.0f, bottomV, 0.0f, bottomV, 1.0f, bottomV, 1.0f, topV }; deviceCommandContext->SetVertexInputLayout(vertexInputLayout); deviceCommandContext->SetVertexBufferData( 0, quadVerts, std::size(quadVerts) * sizeof(quadVerts[0])); deviceCommandContext->SetVertexBufferData( 1, quadTex, std::size(quadTex) * sizeof(quadTex[0])); deviceCommandContext->Draw(0, 6); } } // anonymous namespace CPostprocManager::CPostprocManager(Renderer::Backend::IDevice* device) : m_Device(device), m_IsInitialized(false), m_PostProcEffect(L"default"), m_WhichBuffer(true), m_Sharpness(0.3f), m_UsingMultisampleBuffer(false), m_MultisampleCount(0) { } CPostprocManager::~CPostprocManager() { Cleanup(); } bool CPostprocManager::IsEnabled() const { const bool isDepthStencilFormatPresent = m_Device->GetPreferredDepthStencilFormat( Renderer::Backend::ITexture::Usage::DEPTH_STENCIL_ATTACHMENT, true, true) != Renderer::Backend::Format::UNDEFINED; return g_RenderingOptions.GetPostProc() && 
m_Device->GetBackend() != Renderer::Backend::Backend::GL_ARB && isDepthStencilFormatPresent; } void CPostprocManager::Cleanup() { if (!m_IsInitialized) // Only cleanup if previously used return; m_CaptureFramebuffer.reset(); m_PingFramebuffer.reset(); m_PongFramebuffer.reset(); m_ColorTex1.reset(); m_ColorTex2.reset(); m_DepthTex.reset(); for (BlurScale& scale : m_BlurScales) { for (BlurScale::Step& step : scale.steps) { step.framebuffer.reset(); step.texture.reset(); } } } void CPostprocManager::Initialize() { if (m_IsInitialized) return; const std::array attributes{{ {Renderer::Backend::VertexAttributeStream::POSITION, Renderer::Backend::Format::R32G32_SFLOAT, 0, sizeof(float) * 2, Renderer::Backend::VertexAttributeRate::PER_VERTEX, 0}, {Renderer::Backend::VertexAttributeStream::UV0, Renderer::Backend::Format::R32G32_SFLOAT, 0, sizeof(float) * 2, Renderer::Backend::VertexAttributeRate::PER_VERTEX, 1}, }}; m_VertexInputLayout = g_Renderer.GetVertexInputLayout(attributes); const uint32_t maxSamples = m_Device->GetCapabilities().maxSampleCount; const uint32_t possibleSampleCounts[] = {2, 4, 8, 16}; std::copy_if( std::begin(possibleSampleCounts), std::end(possibleSampleCounts), std::back_inserter(m_AllowedSampleCounts), [maxSamples](const uint32_t sampleCount) { return sampleCount <= maxSamples; } ); // The screen size starts out correct and then must be updated with Resize() - m_Width = g_Renderer.GetWidth(); - m_Height = g_Renderer.GetHeight(); + RecalculateSize(g_Renderer.GetWidth(), g_Renderer.GetHeight()); RecreateBuffers(); m_IsInitialized = true; // Once we have initialised the buffers, we can update the techniques. 
UpdateAntiAliasingTechnique(); UpdateSharpeningTechnique(); UpdateSharpnessFactor(); + CStr upscaleName; + CFG_GET_VAL("renderer.upscale.technique", upscaleName); + SetUpscaleTechnique(upscaleName); // This might happen after the map is loaded and the effect chosen SetPostEffect(m_PostProcEffect); + + if (m_Device->GetCapabilities().computeShaders) + m_DownscaleComputeTech = g_Renderer.GetShaderManager().LoadEffect(CStrIntern("compute_downscale")); } void CPostprocManager::Resize() { - m_Width = g_Renderer.GetWidth(); - m_Height = g_Renderer.GetHeight(); + RecalculateSize(g_Renderer.GetWidth(), g_Renderer.GetHeight()); // If the buffers were intialized, recreate them to the new size. if (m_IsInitialized) RecreateBuffers(); } void CPostprocManager::RecreateBuffers() { Cleanup(); #define GEN_BUFFER_RGBA(name, w, h) \ name = m_Device->CreateTexture2D( \ "PostProc" #name, \ Renderer::Backend::ITexture::Usage::SAMPLED | \ Renderer::Backend::ITexture::Usage::COLOR_ATTACHMENT | \ Renderer::Backend::ITexture::Usage::TRANSFER_SRC | \ Renderer::Backend::ITexture::Usage::TRANSFER_DST, \ Renderer::Backend::Format::R8G8B8A8_UNORM, w, h, \ Renderer::Backend::Sampler::MakeDefaultSampler( \ Renderer::Backend::Sampler::Filter::LINEAR, \ Renderer::Backend::Sampler::AddressMode::CLAMP_TO_EDGE)); // Two fullscreen ping-pong textures. 
GEN_BUFFER_RGBA(m_ColorTex1, m_Width, m_Height); GEN_BUFFER_RGBA(m_ColorTex2, m_Width, m_Height); + if (m_UnscaledWidth != m_Width && m_Device->GetCapabilities().computeShaders) + { + const uint32_t usage = + Renderer::Backend::ITexture::Usage::TRANSFER_SRC | + Renderer::Backend::ITexture::Usage::COLOR_ATTACHMENT | + Renderer::Backend::ITexture::Usage::SAMPLED | + Renderer::Backend::ITexture::Usage::STORAGE; + m_UnscaledTexture1 = m_Device->CreateTexture2D( + "PostProcUnscaledTexture1", usage, + Renderer::Backend::Format::R8G8B8A8_UNORM, + m_UnscaledWidth, m_UnscaledHeight, + Renderer::Backend::Sampler::MakeDefaultSampler( + Renderer::Backend::Sampler::Filter::LINEAR, + Renderer::Backend::Sampler::AddressMode::CLAMP_TO_EDGE)); + + m_UnscaledTexture2 = m_Device->CreateTexture2D( + "PostProcUnscaledTexture2", usage, + Renderer::Backend::Format::R8G8B8A8_UNORM, m_UnscaledWidth, m_UnscaledHeight, + Renderer::Backend::Sampler::MakeDefaultSampler( + Renderer::Backend::Sampler::Filter::LINEAR, + Renderer::Backend::Sampler::AddressMode::CLAMP_TO_EDGE)); + + Renderer::Backend::SColorAttachment colorAttachment{}; + colorAttachment.clearColor = CColor{0.0f, 0.0f, 0.0f, 0.0f}; + colorAttachment.loadOp = Renderer::Backend::AttachmentLoadOp::LOAD; + colorAttachment.storeOp = Renderer::Backend::AttachmentStoreOp::STORE; + + colorAttachment.texture = m_UnscaledTexture1.get(); + m_UnscaledFramebuffer1 = m_Device->CreateFramebuffer("PostprocUnscaledFramebuffer1", + &colorAttachment, nullptr); + + colorAttachment.texture = m_UnscaledTexture2.get(); + m_UnscaledFramebuffer2 = m_Device->CreateFramebuffer("PostprocUnscaledFramebuffer2", + &colorAttachment, nullptr); + } + // Textures for several blur sizes. It would be possible to reuse // m_BlurTex2b, thus avoiding the need for m_BlurTex4b and m_BlurTex8b, though given // that these are fairly small it's probably not worth complicating the coordinates passed // to the blur helper functions. 
uint32_t width = m_Width / 2, height = m_Height / 2; for (BlurScale& scale : m_BlurScales) { for (BlurScale::Step& step : scale.steps) { GEN_BUFFER_RGBA(step.texture, width, height); Renderer::Backend::SColorAttachment colorAttachment{}; colorAttachment.texture = step.texture.get(); colorAttachment.loadOp = Renderer::Backend::AttachmentLoadOp::LOAD; colorAttachment.storeOp = Renderer::Backend::AttachmentStoreOp::STORE; colorAttachment.clearColor = CColor{0.0f, 0.0f, 0.0f, 0.0f}; step.framebuffer = m_Device->CreateFramebuffer( "BlurScaleStepFramebuffer", &colorAttachment, nullptr); } width = std::max(1u, width / 2); height = std::max(1u, height / 2); } #undef GEN_BUFFER_RGBA // Allocate the Depth/Stencil texture. m_DepthTex = m_Device->CreateTexture2D("PostProcDepthTexture", Renderer::Backend::ITexture::Usage::SAMPLED | Renderer::Backend::ITexture::Usage::DEPTH_STENCIL_ATTACHMENT, m_Device->GetPreferredDepthStencilFormat( Renderer::Backend::ITexture::Usage::SAMPLED | Renderer::Backend::ITexture::Usage::DEPTH_STENCIL_ATTACHMENT, true, true), m_Width, m_Height, Renderer::Backend::Sampler::MakeDefaultSampler( Renderer::Backend::Sampler::Filter::LINEAR, Renderer::Backend::Sampler::AddressMode::CLAMP_TO_EDGE)); // Set up the framebuffers with some initial textures. 
Renderer::Backend::SColorAttachment colorAttachment{}; colorAttachment.texture = m_ColorTex1.get(); colorAttachment.loadOp = Renderer::Backend::AttachmentLoadOp::DONT_CARE; colorAttachment.storeOp = Renderer::Backend::AttachmentStoreOp::STORE; colorAttachment.clearColor = CColor{0.0f, 0.0f, 0.0f, 0.0f}; Renderer::Backend::SDepthStencilAttachment depthStencilAttachment{}; depthStencilAttachment.texture = m_DepthTex.get(); depthStencilAttachment.loadOp = Renderer::Backend::AttachmentLoadOp::CLEAR; depthStencilAttachment.storeOp = Renderer::Backend::AttachmentStoreOp::STORE; m_CaptureFramebuffer = m_Device->CreateFramebuffer("PostprocCaptureFramebuffer", &colorAttachment, &depthStencilAttachment); colorAttachment.texture = m_ColorTex1.get(); colorAttachment.loadOp = Renderer::Backend::AttachmentLoadOp::LOAD; colorAttachment.storeOp = Renderer::Backend::AttachmentStoreOp::STORE; m_PingFramebuffer = m_Device->CreateFramebuffer("PostprocPingFramebuffer", &colorAttachment, nullptr); colorAttachment.texture = m_ColorTex2.get(); m_PongFramebuffer = m_Device->CreateFramebuffer("PostprocPongFramebuffer", &colorAttachment, nullptr); if (!m_CaptureFramebuffer || !m_PingFramebuffer || !m_PongFramebuffer) { LOGWARNING("Failed to create postproc framebuffers"); g_RenderingOptions.SetPostProc(false); } if (m_UsingMultisampleBuffer) { DestroyMultisampleBuffer(); CreateMultisampleBuffer(); } } void CPostprocManager::ApplyBlurDownscale2x( Renderer::Backend::IDeviceCommandContext* deviceCommandContext, Renderer::Backend::IFramebuffer* framebuffer, Renderer::Backend::ITexture* inTex, int inWidth, int inHeight) { deviceCommandContext->BeginFramebufferPass(framebuffer); Renderer::Backend::IDeviceCommandContext::Rect viewportRect{}; viewportRect.width = inWidth / 2; viewportRect.height = inHeight / 2; deviceCommandContext->SetViewports(1, &viewportRect); // Get bloom shader with instructions to simply copy texels. 
CShaderDefines defines; defines.Add(str_BLOOM_NOP, str_1); CShaderTechniquePtr tech = g_Renderer.GetShaderManager().LoadEffect(str_bloom, defines); deviceCommandContext->SetGraphicsPipelineState( tech->GetGraphicsPipelineState()); deviceCommandContext->BeginPass(); Renderer::Backend::IShaderProgram* shader = tech->GetShader(); deviceCommandContext->SetTexture( shader->GetBindingSlot(str_renderedTex), inTex); DrawFullscreenQuad(m_VertexInputLayout, deviceCommandContext); deviceCommandContext->EndPass(); deviceCommandContext->EndFramebufferPass(); } void CPostprocManager::ApplyBlurGauss( Renderer::Backend::IDeviceCommandContext* deviceCommandContext, Renderer::Backend::ITexture* inTex, Renderer::Backend::ITexture* tempTex, Renderer::Backend::IFramebuffer* tempFramebuffer, Renderer::Backend::IFramebuffer* outFramebuffer, int inWidth, int inHeight) { deviceCommandContext->BeginFramebufferPass(tempFramebuffer); Renderer::Backend::IDeviceCommandContext::Rect viewportRect{}; viewportRect.width = inWidth; viewportRect.height = inHeight; deviceCommandContext->SetViewports(1, &viewportRect); // Get bloom shader, for a horizontal Gaussian blur pass. CShaderDefines defines2; defines2.Add(str_BLOOM_PASS_H, str_1); CShaderTechniquePtr tech = g_Renderer.GetShaderManager().LoadEffect(str_bloom, defines2); deviceCommandContext->SetGraphicsPipelineState( tech->GetGraphicsPipelineState()); deviceCommandContext->BeginPass(); Renderer::Backend::IShaderProgram* shader = tech->GetShader(); deviceCommandContext->SetTexture( shader->GetBindingSlot(str_renderedTex), inTex); deviceCommandContext->SetUniform( shader->GetBindingSlot(str_texSize), inWidth, inHeight); DrawFullscreenQuad(m_VertexInputLayout, deviceCommandContext); deviceCommandContext->EndPass(); deviceCommandContext->EndFramebufferPass(); deviceCommandContext->BeginFramebufferPass(outFramebuffer); deviceCommandContext->SetViewports(1, &viewportRect); // Get bloom shader, for a vertical Gaussian blur pass. 
CShaderDefines defines3; defines3.Add(str_BLOOM_PASS_V, str_1); tech = g_Renderer.GetShaderManager().LoadEffect(str_bloom, defines3); deviceCommandContext->SetGraphicsPipelineState( tech->GetGraphicsPipelineState()); deviceCommandContext->BeginPass(); shader = tech->GetShader(); // Our input texture to the shader is the output of the horizontal pass. deviceCommandContext->SetTexture( shader->GetBindingSlot(str_renderedTex), tempTex); deviceCommandContext->SetUniform( shader->GetBindingSlot(str_texSize), inWidth, inHeight); DrawFullscreenQuad(m_VertexInputLayout, deviceCommandContext); deviceCommandContext->EndPass(); deviceCommandContext->EndFramebufferPass(); } void CPostprocManager::ApplyBlur( Renderer::Backend::IDeviceCommandContext* deviceCommandContext) { uint32_t width = m_Width, height = m_Height; Renderer::Backend::ITexture* previousTexture = (m_WhichBuffer ? m_ColorTex1 : m_ColorTex2).get(); for (BlurScale& scale : m_BlurScales) { ApplyBlurDownscale2x(deviceCommandContext, scale.steps[0].framebuffer.get(), previousTexture, width, height); width /= 2; height /= 2; ApplyBlurGauss(deviceCommandContext, scale.steps[0].texture.get(), scale.steps[1].texture.get(), scale.steps[1].framebuffer.get(), scale.steps[0].framebuffer.get(), width, height); } } Renderer::Backend::IFramebuffer* CPostprocManager::PrepareAndGetOutputFramebuffer() { ENSURE(m_IsInitialized); - // Leaves m_PingFbo selected for rendering; m_WhichBuffer stays true at this point. + // Leaves m_PingFramebuffer selected for rendering; m_WhichBuffer stays true at this point. m_WhichBuffer = true; return m_UsingMultisampleBuffer ? 
m_MultisampleFramebuffer.get() : m_CaptureFramebuffer.get(); } +void CPostprocManager::UpscaleTextureByCompute( + Renderer::Backend::IDeviceCommandContext* deviceCommandContext, + CShaderTechnique* shaderTechnique, + Renderer::Backend::ITexture* source, + Renderer::Backend::ITexture* destination) +{ + Renderer::Backend::IShaderProgram* shaderProgram = shaderTechnique->GetShader(); + + const std::array screenSize{{ + static_cast(m_Width), static_cast(m_Height), + static_cast(m_UnscaledWidth), static_cast(m_UnscaledHeight)}}; + + constexpr uint32_t threadGroupWorkRegionDim = 16; + const uint32_t dispatchGroupCountX = DivideRoundUp(m_UnscaledWidth, threadGroupWorkRegionDim); + const uint32_t dispatchGroupCountY = DivideRoundUp(m_UnscaledHeight, threadGroupWorkRegionDim); + + deviceCommandContext->BeginComputePass(); + deviceCommandContext->SetComputePipelineState( + shaderTechnique->GetComputePipelineState()); + deviceCommandContext->SetUniform(shaderProgram->GetBindingSlot(str_screenSize), screenSize); + deviceCommandContext->SetTexture(shaderProgram->GetBindingSlot(str_inTex), source); + deviceCommandContext->SetStorageTexture(shaderProgram->GetBindingSlot(str_outTex), destination); + deviceCommandContext->Dispatch(dispatchGroupCountX, dispatchGroupCountY, 1); + deviceCommandContext->EndComputePass(); +} + +void CPostprocManager::UpscaleTextureByFullscreenQuad( + Renderer::Backend::IDeviceCommandContext* deviceCommandContext, + CShaderTechnique* shaderTechnique, + Renderer::Backend::ITexture* source, + Renderer::Backend::IFramebuffer* destination) +{ + Renderer::Backend::IShaderProgram* shaderProgram = shaderTechnique->GetShader(); + + const std::array screenSize{{ + static_cast(m_Width), static_cast(m_Height), + static_cast(m_UnscaledWidth), static_cast(m_UnscaledHeight)}}; + + deviceCommandContext->BeginFramebufferPass(destination); + + Renderer::Backend::IDeviceCommandContext::Rect viewportRect{}; + viewportRect.width = destination->GetWidth(); + 
viewportRect.height = destination->GetHeight(); + deviceCommandContext->SetViewports(1, &viewportRect); + + deviceCommandContext->SetGraphicsPipelineState( + shaderTechnique->GetGraphicsPipelineState()); + deviceCommandContext->BeginPass(); + + deviceCommandContext->SetTexture( + shaderProgram->GetBindingSlot(str_inTex), source); + deviceCommandContext->SetUniform(shaderProgram->GetBindingSlot(str_screenSize), screenSize); + + DrawFullscreenQuad(m_VertexInputLayout, deviceCommandContext); + + deviceCommandContext->EndPass(); + deviceCommandContext->EndFramebufferPass(); +} + +void CPostprocManager::ApplySharpnessAfterScale( + Renderer::Backend::IDeviceCommandContext* deviceCommandContext, + CShaderTechnique* shaderTechnique, + Renderer::Backend::ITexture* source, + Renderer::Backend::ITexture* destination) +{ + Renderer::Backend::IShaderProgram* shaderProgram = shaderTechnique->GetShader(); + + // Recommended sharpness for RCAS. + constexpr float sharpness = 0.2f; + + const std::array screenSize{ { + static_cast(m_Width), static_cast(m_Height), + static_cast(m_UnscaledWidth), static_cast(m_UnscaledHeight)} }; + + constexpr uint32_t threadGroupWorkRegionDim = 16; + const uint32_t dispatchGroupCountX = DivideRoundUp(m_UnscaledWidth, threadGroupWorkRegionDim); + const uint32_t dispatchGroupCountY = DivideRoundUp(m_UnscaledHeight, threadGroupWorkRegionDim); + + deviceCommandContext->BeginComputePass(); + deviceCommandContext->SetComputePipelineState( + shaderTechnique->GetComputePipelineState()); + deviceCommandContext->SetUniform(shaderProgram->GetBindingSlot(str_sharpness), sharpness); + deviceCommandContext->SetUniform(shaderProgram->GetBindingSlot(str_screenSize), screenSize); + deviceCommandContext->SetTexture(shaderProgram->GetBindingSlot(str_inTex), source); + deviceCommandContext->SetStorageTexture( + shaderProgram->GetBindingSlot(str_outTex), destination); + deviceCommandContext->Dispatch(dispatchGroupCountX, dispatchGroupCountY, 1); + 
deviceCommandContext->EndComputePass(); +} + +void CPostprocManager::DownscaleTextureByCompute( + Renderer::Backend::IDeviceCommandContext* deviceCommandContext, + CShaderTechnique* shaderTechnique, + Renderer::Backend::ITexture* source, + Renderer::Backend::ITexture* destination) +{ + Renderer::Backend::IShaderProgram* shaderProgram = shaderTechnique->GetShader(); + + const std::array screenSize{{ + static_cast(m_Width), static_cast(m_Height), + static_cast(m_UnscaledWidth), static_cast(m_UnscaledHeight)}}; + + constexpr uint32_t threadGroupWorkRegionDim = 8; + const uint32_t dispatchGroupCountX = DivideRoundUp(m_UnscaledWidth, threadGroupWorkRegionDim); + const uint32_t dispatchGroupCountY = DivideRoundUp(m_UnscaledHeight, threadGroupWorkRegionDim); + + deviceCommandContext->BeginComputePass(); + deviceCommandContext->SetComputePipelineState( + shaderTechnique->GetComputePipelineState()); + deviceCommandContext->SetUniform(shaderProgram->GetBindingSlot(str_screenSize), screenSize); + deviceCommandContext->SetTexture(shaderProgram->GetBindingSlot(str_inTex), source); + deviceCommandContext->SetStorageTexture(shaderProgram->GetBindingSlot(str_outTex), destination); + deviceCommandContext->Dispatch(dispatchGroupCountX, dispatchGroupCountY, 1); + deviceCommandContext->EndComputePass(); +} + void CPostprocManager::BlitOutputFramebuffer( Renderer::Backend::IDeviceCommandContext* deviceCommandContext, Renderer::Backend::IFramebuffer* destination) { ENSURE(m_IsInitialized); GPU_SCOPED_LABEL(deviceCommandContext, "Copy postproc to backbuffer"); - Renderer::Backend::IFramebuffer* source = - (m_WhichBuffer ? m_PingFramebuffer : m_PongFramebuffer).get(); + Renderer::Backend::ITexture* previousTexture = + (m_WhichBuffer ? m_ColorTex1 : m_ColorTex2).get(); - // We blit to the backbuffer from the previous active buffer. - // We'll have upscaling/downscaling separately. 
- Renderer::Backend::IDeviceCommandContext::Rect region{}; - region.width = std::min(source->GetWidth(), destination->GetWidth()); - region.height = std::min(source->GetHeight(), destination->GetHeight()); - deviceCommandContext->BlitFramebuffer( - source, destination, region, region, - Renderer::Backend::Sampler::Filter::NEAREST); + if (ShouldUpscale()) + { + if (m_UpscaleComputeTech) + { + Renderer::Backend::ITexture* unscaledTexture = m_RCASComputeTech ? m_UnscaledTexture1.get() : m_UnscaledTexture2.get(); + UpscaleTextureByCompute(deviceCommandContext, m_UpscaleComputeTech.get(), previousTexture, unscaledTexture); + if (m_RCASComputeTech) + ApplySharpnessAfterScale(deviceCommandContext, m_RCASComputeTech.get(), m_UnscaledTexture1.get(), m_UnscaledTexture2.get()); + + Renderer::Backend::IDeviceCommandContext::Rect sourceRegion{}, destinationRegion{}; + sourceRegion.width = m_UnscaledTexture2->GetWidth(); + sourceRegion.height = m_UnscaledTexture2->GetHeight(); + destinationRegion.width = destination->GetWidth(); + destinationRegion.height = destination->GetHeight(); + deviceCommandContext->BlitFramebuffer( + m_UnscaledFramebuffer2.get(), destination, sourceRegion, destinationRegion, + Renderer::Backend::Sampler::Filter::NEAREST); + } + else + { + UpscaleTextureByFullscreenQuad(deviceCommandContext, m_UpscaleTech.get(), previousTexture, destination); + } + } + else if (ShouldDownscale()) + { + Renderer::Backend::IDeviceCommandContext::Rect sourceRegion{}; + Renderer::Backend::Sampler::Filter samplerFilter{ + Renderer::Backend::Sampler::Filter::NEAREST}; + Renderer::Backend::IFramebuffer* source{nullptr}; + + if (m_DownscaleComputeTech) + { + DownscaleTextureByCompute(deviceCommandContext, m_DownscaleComputeTech.get(), previousTexture, m_UnscaledTexture1.get()); + + source = m_UnscaledFramebuffer1.get(); + sourceRegion.width = m_UnscaledTexture1->GetWidth(); + sourceRegion.height = m_UnscaledTexture1->GetHeight(); + } + else + { + source = (m_WhichBuffer ? 
m_PingFramebuffer : m_PongFramebuffer).get(); + sourceRegion.width = source->GetWidth(); + sourceRegion.height = source->GetHeight(); + samplerFilter = Renderer::Backend::Sampler::Filter::LINEAR; + } + + Renderer::Backend::IDeviceCommandContext::Rect destinationRegion{}; + destinationRegion.width = destination->GetWidth(); + destinationRegion.height = destination->GetHeight(); + deviceCommandContext->BlitFramebuffer( + source, destination, sourceRegion, destinationRegion, samplerFilter); + } + else + { + Renderer::Backend::IFramebuffer* source = + (m_WhichBuffer ? m_PingFramebuffer : m_PongFramebuffer).get(); + + // We blit to the backbuffer from the previous active buffer. + Renderer::Backend::IDeviceCommandContext::Rect region{}; + region.width = std::min(source->GetWidth(), destination->GetWidth()); + region.height = std::min(source->GetHeight(), destination->GetHeight()); + deviceCommandContext->BlitFramebuffer( + source, destination, region, region, + Renderer::Backend::Sampler::Filter::NEAREST); + } } void CPostprocManager::ApplyEffect( Renderer::Backend::IDeviceCommandContext* deviceCommandContext, const CShaderTechniquePtr& shaderTech, int pass) { - // select the other FBO for rendering + // Select the other framebuffer for rendering. Renderer::Backend::IFramebuffer* framebuffer = (m_WhichBuffer ? m_PongFramebuffer : m_PingFramebuffer).get(); deviceCommandContext->BeginFramebufferPass(framebuffer); Renderer::Backend::IDeviceCommandContext::Rect viewportRect{}; viewportRect.width = framebuffer->GetWidth(); viewportRect.height = framebuffer->GetHeight(); deviceCommandContext->SetViewports(1, &viewportRect); deviceCommandContext->SetGraphicsPipelineState( shaderTech->GetGraphicsPipelineState(pass)); deviceCommandContext->BeginPass(); Renderer::Backend::IShaderProgram* shader = shaderTech->GetShader(pass); - // Use the textures from the current FBO as input to the shader. + // Use the textures from the current framebuffer as input to the shader. 
// We also bind a bunch of other textures and parameters, but since // this only happens once per frame the overhead is negligible. deviceCommandContext->SetTexture( shader->GetBindingSlot(str_renderedTex), m_WhichBuffer ? m_ColorTex1.get() : m_ColorTex2.get()); deviceCommandContext->SetTexture( shader->GetBindingSlot(str_depthTex), m_DepthTex.get()); deviceCommandContext->SetTexture( shader->GetBindingSlot(str_blurTex2), m_BlurScales[0].steps[0].texture.get()); deviceCommandContext->SetTexture( shader->GetBindingSlot(str_blurTex4), m_BlurScales[1].steps[0].texture.get()); deviceCommandContext->SetTexture( shader->GetBindingSlot(str_blurTex8), m_BlurScales[2].steps[0].texture.get()); deviceCommandContext->SetUniform(shader->GetBindingSlot(str_width), m_Width); deviceCommandContext->SetUniform(shader->GetBindingSlot(str_height), m_Height); deviceCommandContext->SetUniform(shader->GetBindingSlot(str_zNear), m_NearPlane); deviceCommandContext->SetUniform(shader->GetBindingSlot(str_zFar), m_FarPlane); deviceCommandContext->SetUniform(shader->GetBindingSlot(str_sharpness), m_Sharpness); deviceCommandContext->SetUniform(shader->GetBindingSlot(str_brightness), g_LightEnv.m_Brightness); deviceCommandContext->SetUniform(shader->GetBindingSlot(str_hdr), g_LightEnv.m_Contrast); deviceCommandContext->SetUniform(shader->GetBindingSlot(str_saturation), g_LightEnv.m_Saturation); deviceCommandContext->SetUniform(shader->GetBindingSlot(str_bloom), g_LightEnv.m_Bloom); DrawFullscreenQuad(m_VertexInputLayout, deviceCommandContext); deviceCommandContext->EndPass(); deviceCommandContext->EndFramebufferPass(); m_WhichBuffer = !m_WhichBuffer; } void CPostprocManager::ApplyPostproc( Renderer::Backend::IDeviceCommandContext* deviceCommandContext) { ENSURE(m_IsInitialized); // Don't do anything if we are using the default effect and no AA. 
const bool hasEffects = m_PostProcEffect != L"default"; const bool hasARB = m_Device->GetBackend() == Renderer::Backend::Backend::GL_ARB; const bool hasAA = m_AATech && !hasARB; const bool hasSharp = m_SharpTech && !hasARB; if (!hasEffects && !hasAA && !hasSharp) return; GPU_SCOPED_LABEL(deviceCommandContext, "Render postproc"); if (hasEffects) { // First render blur textures. Note that this only happens ONLY ONCE, before any effects are applied! // (This may need to change depending on future usage, however that will have a fps hit) ApplyBlur(deviceCommandContext); for (int pass = 0; pass < m_PostProcTech->GetNumPasses(); ++pass) ApplyEffect(deviceCommandContext, m_PostProcTech, pass); } if (hasAA) { for (int pass = 0; pass < m_AATech->GetNumPasses(); ++pass) ApplyEffect(deviceCommandContext, m_AATech, pass); } - if (hasSharp) + if (hasSharp && !ShouldUpscale()) { for (int pass = 0; pass < m_SharpTech->GetNumPasses(); ++pass) ApplyEffect(deviceCommandContext, m_SharpTech, pass); } } // Generate list of available effect-sets std::vector CPostprocManager::GetPostEffects() { std::vector effects; const VfsPath folder(L"shaders/effects/postproc/"); VfsPaths pathnames; if (vfs::GetPathnames(g_VFS, folder, 0, pathnames) < 0) LOGERROR("Error finding Post effects in '%s'", folder.string8()); for (const VfsPath& path : pathnames) if (path.Extension() == L".xml") effects.push_back(path.Basename().string()); // Add the default "null" effect to the list. 
effects.push_back(L"default"); sort(effects.begin(), effects.end()); return effects; } void CPostprocManager::SetPostEffect(const CStrW& name) { if (m_IsInitialized) { if (name != L"default") { CStrW n = L"postproc/" + name; m_PostProcTech = g_Renderer.GetShaderManager().LoadEffect(CStrIntern(n.ToUTF8())); } } m_PostProcEffect = name; } void CPostprocManager::UpdateAntiAliasingTechnique() { if (m_Device->GetBackend() == Renderer::Backend::Backend::GL_ARB || !m_IsInitialized) return; CStr newAAName; CFG_GET_VAL("antialiasing", newAAName); if (m_AAName == newAAName) return; m_AAName = newAAName; m_AATech.reset(); if (m_UsingMultisampleBuffer) { m_UsingMultisampleBuffer = false; DestroyMultisampleBuffer(); } // We have to hardcode names in the engine, because anti-aliasing // techinques strongly depend on the graphics pipeline. // We might use enums in future though. constexpr std::string_view msaaPrefix{"msaa"}; if (m_AAName == str_fxaa.string()) { m_AATech = g_Renderer.GetShaderManager().LoadEffect(str_fxaa); } else if (m_AAName.size() > msaaPrefix.size() && std::string_view{m_AAName}.substr(0, msaaPrefix.size()) == msaaPrefix) { // We don't want to enable MSAA in Atlas, because it uses wxWidgets and its canvas. 
if (g_AtlasGameLoop && g_AtlasGameLoop->running) return; if (!m_Device->GetCapabilities().multisampling || m_AllowedSampleCounts.empty()) { LOGWARNING("MSAA is unsupported."); return; } std::stringstream ss(m_AAName.substr(msaaPrefix.size())); ss >> m_MultisampleCount; if (std::find(std::begin(m_AllowedSampleCounts), std::end(m_AllowedSampleCounts), m_MultisampleCount) == std::end(m_AllowedSampleCounts)) { m_MultisampleCount = std::min(4u, m_Device->GetCapabilities().maxSampleCount); LOGWARNING("Wrong MSAA sample count: %s.", m_AAName.EscapeToPrintableASCII().c_str()); } m_UsingMultisampleBuffer = true; CreateMultisampleBuffer(); } } void CPostprocManager::UpdateSharpeningTechnique() { if (m_Device->GetBackend() == Renderer::Backend::Backend::GL_ARB || !m_IsInitialized) return; CStr newSharpName; CFG_GET_VAL("sharpening", newSharpName); if (m_SharpName == newSharpName) return; m_SharpName = newSharpName; m_SharpTech.reset(); if (m_SharpName == "cas") { m_SharpTech = g_Renderer.GetShaderManager().LoadEffect(CStrIntern(m_SharpName)); } } void CPostprocManager::UpdateSharpnessFactor() { CFG_GET_VAL("sharpness", m_Sharpness); } +void CPostprocManager::SetUpscaleTechnique(const CStr& upscaleName) +{ + m_UpscaleTech.reset(); + m_UpscaleComputeTech.reset(); + m_RCASComputeTech.reset(); + if (m_Device->GetCapabilities().computeShaders && upscaleName == "fsr") + { + m_UpscaleComputeTech = g_Renderer.GetShaderManager().LoadEffect(str_compute_upscale_fsr); + m_RCASComputeTech = g_Renderer.GetShaderManager().LoadEffect(str_compute_rcas); + } + else if (upscaleName == "pixelated") + { + m_UpscaleTech = g_Renderer.GetShaderManager().LoadEffect(str_upscale_nearest); + } + else + { + m_UpscaleTech = g_Renderer.GetShaderManager().LoadEffect(str_upscale_bilinear); + } +} + void CPostprocManager::SetDepthBufferClipPlanes(float nearPlane, float farPlane) { m_NearPlane = nearPlane; m_FarPlane = farPlane; } void CPostprocManager::CreateMultisampleBuffer() { m_MultisampleColorTex = 
m_Device->CreateTexture("PostProcColorMS", Renderer::Backend::ITexture::Type::TEXTURE_2D_MULTISAMPLE, Renderer::Backend::ITexture::Usage::COLOR_ATTACHMENT | Renderer::Backend::ITexture::Usage::TRANSFER_SRC, Renderer::Backend::Format::R8G8B8A8_UNORM, m_Width, m_Height, Renderer::Backend::Sampler::MakeDefaultSampler( Renderer::Backend::Sampler::Filter::LINEAR, Renderer::Backend::Sampler::AddressMode::CLAMP_TO_EDGE), 1, m_MultisampleCount); // Allocate the Depth/Stencil texture. m_MultisampleDepthTex = m_Device->CreateTexture("PostProcDepthMS", Renderer::Backend::ITexture::Type::TEXTURE_2D_MULTISAMPLE, Renderer::Backend::ITexture::Usage::DEPTH_STENCIL_ATTACHMENT | Renderer::Backend::ITexture::Usage::TRANSFER_SRC, m_Device->GetPreferredDepthStencilFormat( Renderer::Backend::ITexture::Usage::DEPTH_STENCIL_ATTACHMENT | Renderer::Backend::ITexture::Usage::TRANSFER_SRC, true, true), m_Width, m_Height, Renderer::Backend::Sampler::MakeDefaultSampler( Renderer::Backend::Sampler::Filter::LINEAR, Renderer::Backend::Sampler::AddressMode::CLAMP_TO_EDGE), 1, m_MultisampleCount); // Set up the framebuffers with some initial textures. 
Renderer::Backend::SColorAttachment colorAttachment{}; colorAttachment.texture = m_MultisampleColorTex.get(); colorAttachment.loadOp = Renderer::Backend::AttachmentLoadOp::DONT_CARE; colorAttachment.storeOp = Renderer::Backend::AttachmentStoreOp::STORE; colorAttachment.clearColor = CColor{0.0f, 0.0f, 0.0f, 0.0f}; Renderer::Backend::SDepthStencilAttachment depthStencilAttachment{}; depthStencilAttachment.texture = m_MultisampleDepthTex.get(); depthStencilAttachment.loadOp = Renderer::Backend::AttachmentLoadOp::CLEAR; depthStencilAttachment.storeOp = Renderer::Backend::AttachmentStoreOp::STORE; m_MultisampleFramebuffer = m_Device->CreateFramebuffer( "PostprocMultisampleFramebuffer", &colorAttachment, &depthStencilAttachment); if (!m_MultisampleFramebuffer) { LOGERROR("Failed to create postproc multisample framebuffer"); m_UsingMultisampleBuffer = false; DestroyMultisampleBuffer(); } } void CPostprocManager::DestroyMultisampleBuffer() { if (m_UsingMultisampleBuffer) return; m_MultisampleFramebuffer.reset(); m_MultisampleColorTex.reset(); m_MultisampleDepthTex.reset(); } bool CPostprocManager::IsMultisampleEnabled() const { return m_UsingMultisampleBuffer; } void CPostprocManager::ResolveMultisampleFramebuffer( Renderer::Backend::IDeviceCommandContext* deviceCommandContext) { if (!m_UsingMultisampleBuffer) return; GPU_SCOPED_LABEL(deviceCommandContext, "Resolve postproc multisample"); deviceCommandContext->ResolveFramebuffer( m_MultisampleFramebuffer.get(), m_PingFramebuffer.get()); } + +void CPostprocManager::RecalculateSize(const uint32_t width, const uint32_t height) +{ + if (m_Device->GetBackend() == Renderer::Backend::Backend::GL_ARB) + { + m_Scale = 1.0f; + return; + } + CFG_GET_VAL("renderer.scale", m_Scale); + if (m_Scale < 0.25f || m_Scale > 2.0f) + { + LOGWARNING("Invalid renderer scale: %0.2f", m_Scale); + m_Scale = 1.0f; + } + m_UnscaledWidth = width; + m_UnscaledHeight = height; + m_Width = m_UnscaledWidth * m_Scale; + m_Height = m_UnscaledHeight * 
m_Scale; +} + +bool CPostprocManager::ShouldUpscale() const +{ + return m_Width < m_UnscaledWidth; +} + +bool CPostprocManager::ShouldDownscale() const +{ + return m_Width > m_UnscaledWidth; +} Index: ps/trunk/source/renderer/Renderer.cpp =================================================================== --- ps/trunk/source/renderer/Renderer.cpp (revision 28009) +++ ps/trunk/source/renderer/Renderer.cpp (revision 28010) @@ -1,912 +1,914 @@ -/* Copyright (C) 2023 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . 
*/ #include "precompiled.h" #include "Renderer.h" #include "graphics/Canvas2D.h" #include "graphics/CinemaManager.h" #include "graphics/GameView.h" #include "graphics/LightEnv.h" #include "graphics/ModelDef.h" #include "graphics/TerrainTextureManager.h" #include "i18n/L10n.h" #include "lib/allocators/shared_ptr.h" #include "lib/hash.h" #include "lib/tex/tex.h" #include "gui/GUIManager.h" #include "ps/CConsole.h" #include "ps/CLogger.h" #include "ps/ConfigDB.h" #include "ps/CStrInternStatic.h" #include "ps/Game.h" #include "ps/GameSetup/Config.h" #include "ps/GameSetup/GameSetup.h" #include "ps/Globals.h" #include "ps/Loader.h" #include "ps/Profile.h" #include "ps/Filesystem.h" #include "ps/World.h" #include "ps/ProfileViewer.h" #include "graphics/Camera.h" #include "graphics/FontManager.h" #include "graphics/ShaderManager.h" #include "graphics/Terrain.h" #include "graphics/Texture.h" #include "graphics/TextureManager.h" #include "ps/Util.h" #include "ps/VideoMode.h" #include "renderer/backend/IDevice.h" #include "renderer/DebugRenderer.h" #include "renderer/PostprocManager.h" #include "renderer/RenderingOptions.h" #include "renderer/RenderModifiers.h" #include "renderer/SceneRenderer.h" #include "renderer/TimeManager.h" #include "renderer/VertexBufferManager.h" #include "tools/atlas/GameInterface/GameLoop.h" #include "tools/atlas/GameInterface/View.h" #include namespace { size_t g_NextScreenShotNumber = 0; /////////////////////////////////////////////////////////////////////////////////// // CRendererStatsTable - Profile display of rendering stats /** * Class CRendererStatsTable: Implementation of AbstractProfileTable to * display the renderer stats in-game. * * Accesses CRenderer::m_Stats by keeping the reference passed to the * constructor. 
*/ class CRendererStatsTable : public AbstractProfileTable { NONCOPYABLE(CRendererStatsTable); public: CRendererStatsTable(const CRenderer::Stats& st); // Implementation of AbstractProfileTable interface CStr GetName() override; CStr GetTitle() override; size_t GetNumberRows() override; const std::vector& GetColumns() override; CStr GetCellText(size_t row, size_t col) override; AbstractProfileTable* GetChild(size_t row) override; private: /// Reference to the renderer singleton's stats const CRenderer::Stats& Stats; /// Column descriptions std::vector columnDescriptions; enum { Row_DrawCalls = 0, Row_TerrainTris, Row_WaterTris, Row_ModelTris, Row_OverlayTris, Row_BlendSplats, Row_Particles, Row_VBReserved, Row_VBAllocated, Row_TextureMemory, Row_ShadersLoaded, // Must be last to count number of rows NumberRows }; }; // Construction CRendererStatsTable::CRendererStatsTable(const CRenderer::Stats& st) : Stats(st) { columnDescriptions.push_back(ProfileColumn("Name", 230)); columnDescriptions.push_back(ProfileColumn("Value", 100)); } // Implementation of AbstractProfileTable interface CStr CRendererStatsTable::GetName() { return "renderer"; } CStr CRendererStatsTable::GetTitle() { return "Renderer statistics"; } size_t CRendererStatsTable::GetNumberRows() { return NumberRows; } const std::vector& CRendererStatsTable::GetColumns() { return columnDescriptions; } CStr CRendererStatsTable::GetCellText(size_t row, size_t col) { char buf[256]; switch(row) { case Row_DrawCalls: if (col == 0) return "# draw calls"; sprintf_s(buf, sizeof(buf), "%lu", (unsigned long)Stats.m_DrawCalls); return buf; case Row_TerrainTris: if (col == 0) return "# terrain tris"; sprintf_s(buf, sizeof(buf), "%lu", (unsigned long)Stats.m_TerrainTris); return buf; case Row_WaterTris: if (col == 0) return "# water tris"; sprintf_s(buf, sizeof(buf), "%lu", (unsigned long)Stats.m_WaterTris); return buf; case Row_ModelTris: if (col == 0) return "# model tris"; sprintf_s(buf, sizeof(buf), "%lu", (unsigned 
long)Stats.m_ModelTris); return buf; case Row_OverlayTris: if (col == 0) return "# overlay tris"; sprintf_s(buf, sizeof(buf), "%lu", (unsigned long)Stats.m_OverlayTris); return buf; case Row_BlendSplats: if (col == 0) return "# blend splats"; sprintf_s(buf, sizeof(buf), "%lu", (unsigned long)Stats.m_BlendSplats); return buf; case Row_Particles: if (col == 0) return "# particles"; sprintf_s(buf, sizeof(buf), "%lu", (unsigned long)Stats.m_Particles); return buf; case Row_VBReserved: if (col == 0) return "VB reserved"; sprintf_s(buf, sizeof(buf), "%lu kB", static_cast(g_Renderer.GetVertexBufferManager().GetBytesReserved() / 1024)); return buf; case Row_VBAllocated: if (col == 0) return "VB allocated"; sprintf_s(buf, sizeof(buf), "%lu kB", static_cast(g_Renderer.GetVertexBufferManager().GetBytesAllocated() / 1024)); return buf; case Row_TextureMemory: if (col == 0) return "textures uploaded"; sprintf_s(buf, sizeof(buf), "%lu kB", (unsigned long)g_Renderer.GetTextureManager().GetBytesUploaded() / 1024); return buf; case Row_ShadersLoaded: if (col == 0) return "shader effects loaded"; sprintf_s(buf, sizeof(buf), "%lu", (unsigned long)g_Renderer.GetShaderManager().GetNumEffectsLoaded()); return buf; default: return "???"; } } AbstractProfileTable* CRendererStatsTable::GetChild(size_t UNUSED(row)) { return 0; } } // anonymous namespace /////////////////////////////////////////////////////////////////////////////////// // CRenderer implementation /** * Struct CRendererInternals: Truly hide data that is supposed to be hidden * in this structure so it won't even appear in header files. 
*/ class CRenderer::Internals { NONCOPYABLE(Internals); public: Renderer::Backend::IDevice* device; std::unique_ptr deviceCommandContext; /// true if CRenderer::Open has been called bool IsOpen; /// true if shaders need to be reloaded bool ShadersDirty; /// Table to display renderer stats in-game via profile system CRendererStatsTable profileTable; /// Shader manager CShaderManager shaderManager; /// Texture manager CTextureManager textureManager; CVertexBufferManager vertexBufferManager; /// Time manager CTimeManager timeManager; /// Postprocessing effect manager CPostprocManager postprocManager; CSceneRenderer sceneRenderer; CDebugRenderer debugRenderer; CFontManager fontManager; struct VertexAttributesHash { size_t operator()(const std::vector& attributes) const; }; std::unordered_map< std::vector, std::unique_ptr, VertexAttributesHash> vertexInputLayouts; Internals(Renderer::Backend::IDevice* device) : device(device), deviceCommandContext(device->CreateCommandContext()), IsOpen(false), ShadersDirty(true), profileTable(g_Renderer.m_Stats), shaderManager(device), textureManager(g_VFS, false, device), vertexBufferManager(device), postprocManager(device), sceneRenderer(device) { } }; size_t CRenderer::Internals::VertexAttributesHash::operator()( const std::vector& attributes) const { size_t seed = 0; hash_combine(seed, attributes.size()); for (const Renderer::Backend::SVertexAttributeFormat& attribute : attributes) { hash_combine(seed, attribute.stream); hash_combine(seed, attribute.format); hash_combine(seed, attribute.offset); hash_combine(seed, attribute.stride); hash_combine(seed, attribute.rate); hash_combine(seed, attribute.bindingSlot); } return seed; } CRenderer::CRenderer(Renderer::Backend::IDevice* device) { TIMER(L"InitRenderer"); m = std::make_unique(device); g_ProfileViewer.AddRootTable(&m->profileTable); m_Width = 0; m_Height = 0; m_Stats.Reset(); // Create terrain related stuff. 
new CTerrainTextureManager(device); Open(g_xres, g_yres); // Setup lighting environment. Since the Renderer accesses the // lighting environment through a pointer, this has to be done before // the first Frame. GetSceneRenderer().SetLightEnv(&g_LightEnv); ModelDefActivateFastImpl(); ColorActivateFastImpl(); ModelRenderer::Init(); } CRenderer::~CRenderer() { delete &g_TexMan; // We no longer UnloadWaterTextures here - // that is the responsibility of the module that asked for // them to be loaded (i.e. CGameView). m.reset(); } void CRenderer::ReloadShaders() { ENSURE(m->IsOpen); m->sceneRenderer.ReloadShaders(m->device); m->ShadersDirty = false; } bool CRenderer::Open(int width, int height) { m->IsOpen = true; // Dimensions m_Width = width; m_Height = height; // Validate the currently selected render path SetRenderPath(g_RenderingOptions.GetRenderPath()); m->debugRenderer.Initialize(); if (m->postprocManager.IsEnabled()) m->postprocManager.Initialize(); m->sceneRenderer.Initialize(); return true; } void CRenderer::Resize(int width, int height) { m_Width = width; m_Height = height; m->postprocManager.Resize(); m->sceneRenderer.Resize(width, height); } void CRenderer::SetRenderPath(RenderPath rp) { if (!m->IsOpen) { // Delay until Open() is called. return; } // Renderer has been opened, so validate the selected renderpath const bool hasShadersSupport = m->device->GetCapabilities().ARBShaders || m->device->GetBackend() != Renderer::Backend::Backend::GL_ARB; if (rp == RenderPath::DEFAULT) { if (hasShadersSupport) rp = RenderPath::SHADER; else rp = RenderPath::FIXED; } if (rp == RenderPath::SHADER) { if (!hasShadersSupport) { LOGWARNING("Falling back to fixed function\n"); rp = RenderPath::FIXED; } } // TODO: remove this once capabilities have been properly extracted and the above checks have been moved elsewhere. 
g_RenderingOptions.m_RenderPath = rp; MakeShadersDirty(); } bool CRenderer::ShouldRender() const { return !g_app_minimized && (g_app_has_focus || !g_VideoMode.IsInFullscreen()); } void CRenderer::RenderFrame(const bool needsPresent) { // Do not render if not focused while in fullscreen or minimised, // as that triggers a difficult-to-reproduce crash on some graphic cards. if (!ShouldRender()) return; if (m_ScreenShotType == ScreenShotType::BIG) { RenderBigScreenShot(needsPresent); } else if (m_ScreenShotType == ScreenShotType::DEFAULT) { RenderScreenShot(needsPresent); } else { if (needsPresent) { // In case of no acquired backbuffer we have nothing render to. if (!m->device->AcquireNextBackbuffer()) return; } if (m_ShouldPreloadResourcesBeforeNextFrame) { m_ShouldPreloadResourcesBeforeNextFrame = false; // We don't need to render logger for the preload. RenderFrameImpl(true, false); } RenderFrameImpl(true, true); m->deviceCommandContext->Flush(); if (needsPresent) m->device->Present(); } } void CRenderer::RenderFrameImpl(const bool renderGUI, const bool renderLogger) { PROFILE3("render"); g_Profiler2.RecordGPUFrameStart(); g_TexMan.UploadResourcesIfNeeded(m->deviceCommandContext.get()); m->textureManager.MakeUploadProgress(m->deviceCommandContext.get()); // prepare before starting the renderer frame if (g_Game && g_Game->IsGameStarted()) g_Game->GetView()->BeginFrame(); if (g_Game) m->sceneRenderer.SetSimulation(g_Game->GetSimulation2()); // start new frame BeginFrame(); if (g_Game && g_Game->IsGameStarted()) { g_Game->GetView()->Prepare(m->deviceCommandContext.get()); Renderer::Backend::IFramebuffer* framebuffer = nullptr; + Renderer::Backend::IDeviceCommandContext::Rect viewportRect{}; CPostprocManager& postprocManager = GetPostprocManager(); if (postprocManager.IsEnabled()) { // We have to update the post process manager with real near/far planes // that we use for the scene rendering. 
postprocManager.SetDepthBufferClipPlanes( m->sceneRenderer.GetViewCamera().GetNearPlane(), m->sceneRenderer.GetViewCamera().GetFarPlane() ); postprocManager.Initialize(); framebuffer = postprocManager.PrepareAndGetOutputFramebuffer(); + viewportRect.width = framebuffer->GetWidth(); + viewportRect.height = framebuffer->GetHeight(); } else { // We don't need to clear the color attachment of the framebuffer as the sky // is going to be rendered anyway. framebuffer = m->deviceCommandContext->GetDevice()->GetCurrentBackbuffer( Renderer::Backend::AttachmentLoadOp::DONT_CARE, Renderer::Backend::AttachmentStoreOp::STORE, Renderer::Backend::AttachmentLoadOp::CLEAR, Renderer::Backend::AttachmentStoreOp::DONT_CARE); + + viewportRect.width = m_Width; + viewportRect.height = m_Height; } m->deviceCommandContext->BeginFramebufferPass(framebuffer); - - Renderer::Backend::IDeviceCommandContext::Rect viewportRect{}; - viewportRect.width = m_Width; - viewportRect.height = m_Height; m->deviceCommandContext->SetViewports(1, &viewportRect); g_Game->GetView()->Render(m->deviceCommandContext.get()); if (postprocManager.IsEnabled()) { m->deviceCommandContext->EndFramebufferPass(); if (postprocManager.IsMultisampleEnabled()) postprocManager.ResolveMultisampleFramebuffer(m->deviceCommandContext.get()); postprocManager.ApplyPostproc(m->deviceCommandContext.get()); Renderer::Backend::IFramebuffer* backbuffer = m->deviceCommandContext->GetDevice()->GetCurrentBackbuffer( Renderer::Backend::AttachmentLoadOp::LOAD, Renderer::Backend::AttachmentStoreOp::STORE, Renderer::Backend::AttachmentLoadOp::LOAD, Renderer::Backend::AttachmentStoreOp::DONT_CARE); postprocManager.BlitOutputFramebuffer( m->deviceCommandContext.get(), backbuffer); m->deviceCommandContext->BeginFramebufferPass(backbuffer); Renderer::Backend::IDeviceCommandContext::Rect viewportRect{}; viewportRect.width = m_Width; viewportRect.height = m_Height; m->deviceCommandContext->SetViewports(1, &viewportRect); } 
g_Game->GetView()->RenderOverlays(m->deviceCommandContext.get()); g_Game->GetView()->GetCinema()->Render(); } else { // We have a fullscreen background in our UI so we don't need // to clear the color attachment. // We don't need a depth test to render so we don't care about the // depth-stencil attachment content. // In case of Atlas we don't have g_Game, so we still need to clear depth. const Renderer::Backend::AttachmentLoadOp depthStencilLoadOp = g_AtlasGameLoop && g_AtlasGameLoop->view ? Renderer::Backend::AttachmentLoadOp::CLEAR : Renderer::Backend::AttachmentLoadOp::DONT_CARE; Renderer::Backend::IFramebuffer* backbuffer = m->deviceCommandContext->GetDevice()->GetCurrentBackbuffer( Renderer::Backend::AttachmentLoadOp::DONT_CARE, Renderer::Backend::AttachmentStoreOp::STORE, depthStencilLoadOp, Renderer::Backend::AttachmentStoreOp::DONT_CARE); m->deviceCommandContext->BeginFramebufferPass(backbuffer); Renderer::Backend::IDeviceCommandContext::Rect viewportRect{}; viewportRect.width = m_Width; viewportRect.height = m_Height; m->deviceCommandContext->SetViewports(1, &viewportRect); } // If we're in Atlas game view, render special tools if (g_AtlasGameLoop && g_AtlasGameLoop->view) { g_AtlasGameLoop->view->DrawCinemaPathTool(); } RenderFrame2D(renderGUI, renderLogger); m->deviceCommandContext->EndFramebufferPass(); EndFrame(); const Stats& stats = GetStats(); PROFILE2_ATTR("draw calls: %zu", stats.m_DrawCalls); PROFILE2_ATTR("terrain tris: %zu", stats.m_TerrainTris); PROFILE2_ATTR("water tris: %zu", stats.m_WaterTris); PROFILE2_ATTR("model tris: %zu", stats.m_ModelTris); PROFILE2_ATTR("overlay tris: %zu", stats.m_OverlayTris); PROFILE2_ATTR("blend splats: %zu", stats.m_BlendSplats); PROFILE2_ATTR("particles: %zu", stats.m_Particles); g_Profiler2.RecordGPUFrameEnd(); } void CRenderer::RenderFrame2D(const bool renderGUI, const bool renderLogger) { CCanvas2D canvas(g_xres, g_yres, g_VideoMode.GetScale(), m->deviceCommandContext.get()); 
m->sceneRenderer.RenderTextOverlays(canvas); if (renderGUI) { GPU_SCOPED_LABEL(m->deviceCommandContext.get(), "Render GUI"); // All GUI elements are drawn in Z order to render semi-transparent // objects correctly. g_GUI->Draw(canvas); } // If we're in Atlas game view, render special overlays (e.g. editor bandbox). if (g_AtlasGameLoop && g_AtlasGameLoop->view) { g_AtlasGameLoop->view->DrawOverlays(canvas); } { GPU_SCOPED_LABEL(m->deviceCommandContext.get(), "Render console"); g_Console->Render(canvas); } if (renderLogger) { GPU_SCOPED_LABEL(m->deviceCommandContext.get(), "Render logger"); g_Logger->Render(canvas); } { GPU_SCOPED_LABEL(m->deviceCommandContext.get(), "Render profiler"); // Profile information g_ProfileViewer.RenderProfile(canvas); } } void CRenderer::RenderScreenShot(const bool needsPresent) { m_ScreenShotType = ScreenShotType::NONE; // get next available numbered filename // note: %04d -> always 4 digits, so sorting by filename works correctly. const VfsPath filenameFormat(L"screenshots/screenshot%04d.png"); VfsPath filename; vfs::NextNumberedFilename(g_VFS, filenameFormat, g_NextScreenShotNumber, filename); const size_t width = static_cast(g_xres), height = static_cast(g_yres); const size_t bpp = 24; if (needsPresent && !m->device->AcquireNextBackbuffer()) return; // Hide log messages and re-render RenderFrameImpl(true, false); const size_t img_size = width * height * bpp / 8; const size_t hdr_size = tex_hdr_size(filename); std::shared_ptr buf; AllocateAligned(buf, hdr_size + img_size, maxSectorSize); void* img = buf.get() + hdr_size; Tex t; if (t.wrap(width, height, bpp, TEX_BOTTOM_UP, buf, hdr_size) < 0) return; m->deviceCommandContext->ReadbackFramebufferSync(0, 0, width, height, img); m->deviceCommandContext->Flush(); if (needsPresent) m->device->Present(); if (tex_write(&t, filename) == INFO::OK) { OsPath realPath; g_VFS->GetRealPath(filename, realPath); LOGMESSAGERENDER(g_L10n.Translate("Screenshot written to '%s'"), realPath.string8()); 
debug_printf( CStr(g_L10n.Translate("Screenshot written to '%s'") + "\n").c_str(), realPath.string8().c_str()); } else LOGERROR("Error writing screenshot to '%s'", filename.string8()); } void CRenderer::RenderBigScreenShot(const bool needsPresent) { m_ScreenShotType = ScreenShotType::NONE; // If the game hasn't started yet then use WriteScreenshot to generate the image. if (!g_Game) return RenderScreenShot(needsPresent); int tiles = 4, tileWidth = 256, tileHeight = 256; CFG_GET_VAL("screenshot.tiles", tiles); CFG_GET_VAL("screenshot.tilewidth", tileWidth); CFG_GET_VAL("screenshot.tileheight", tileHeight); if (tiles <= 0 || tileWidth <= 0 || tileHeight <= 0 || tileWidth * tiles % 4 != 0 || tileHeight * tiles % 4 != 0) { LOGWARNING("Invalid big screenshot size: tiles=%d tileWidth=%d tileHeight=%d", tiles, tileWidth, tileHeight); return; } if (g_xres < tileWidth && g_yres < tileHeight) { LOGWARNING( "The window size is too small for a big screenshot, increase the" " window size %dx%d or decrease the tile size %dx%d", g_xres, g_yres, tileWidth, tileHeight); return; } // get next available numbered filename // note: %04d -> always 4 digits, so sorting by filename works correctly. 
const VfsPath filenameFormat(L"screenshots/screenshot%04d.bmp"); VfsPath filename; vfs::NextNumberedFilename(g_VFS, filenameFormat, g_NextScreenShotNumber, filename); const int imageWidth = tileWidth * tiles, imageHeight = tileHeight * tiles; const int bpp = 24; const size_t imageSize = imageWidth * imageHeight * bpp / 8; const size_t tileSize = tileWidth * tileHeight * bpp / 8; const size_t headerSize = tex_hdr_size(filename); void* tileData = malloc(tileSize); if (!tileData) { WARN_IF_ERR(ERR::NO_MEM); return; } std::shared_ptr imageBuffer; AllocateAligned(imageBuffer, headerSize + imageSize, maxSectorSize); Tex t; void* img = imageBuffer.get() + headerSize; if (t.wrap(imageWidth, imageHeight, bpp, TEX_BOTTOM_UP, imageBuffer, headerSize) < 0) { free(tileData); return; } CCamera oldCamera = *g_Game->GetView()->GetCamera(); // Resize various things so that the sizes and aspect ratios are correct { g_Renderer.Resize(tileWidth, tileHeight); SViewPort vp = { 0, 0, tileWidth, tileHeight }; g_Game->GetView()->SetViewport(vp); } // Render each tile CMatrix3D projection; projection.SetIdentity(); const float aspectRatio = 1.0f * tileWidth / tileHeight; for (int tileY = 0; tileY < tiles; ++tileY) { for (int tileX = 0; tileX < tiles; ++tileX) { // Adjust the camera to render the appropriate region if (oldCamera.GetProjectionType() == CCamera::ProjectionType::PERSPECTIVE) { projection.SetPerspectiveTile( oldCamera.GetFOV(), aspectRatio, oldCamera.GetNearPlane(), oldCamera.GetFarPlane(), tiles, tileX, tileY); } g_Game->GetView()->GetCamera()->SetProjection(projection); if (!needsPresent || m->device->AcquireNextBackbuffer()) { RenderFrameImpl(false, false); m->deviceCommandContext->ReadbackFramebufferSync(0, 0, tileWidth, tileHeight, tileData); m->deviceCommandContext->Flush(); if (needsPresent) m->device->Present(); } // Copy the tile pixels into the main image for (int y = 0; y < tileHeight; ++y) { void* dest = static_cast(img) + ((tileY * tileHeight + y) * imageWidth + 
(tileX * tileWidth)) * bpp / 8; void* src = static_cast(tileData) + y * tileWidth * bpp / 8; memcpy(dest, src, tileWidth * bpp / 8); } } } // Restore the viewport settings { g_Renderer.Resize(g_xres, g_yres); SViewPort vp = { 0, 0, g_xres, g_yres }; g_Game->GetView()->SetViewport(vp); g_Game->GetView()->GetCamera()->SetProjectionFromCamera(oldCamera); } if (tex_write(&t, filename) == INFO::OK) { OsPath realPath; g_VFS->GetRealPath(filename, realPath); LOGMESSAGERENDER(g_L10n.Translate("Screenshot written to '%s'"), realPath.string8()); debug_printf( CStr(g_L10n.Translate("Screenshot written to '%s'") + "\n").c_str(), realPath.string8().c_str()); } else LOGERROR("Error writing screenshot to '%s'", filename.string8()); free(tileData); } void CRenderer::BeginFrame() { PROFILE("begin frame"); // Zero out all the per-frame stats. m_Stats.Reset(); if (m->ShadersDirty) ReloadShaders(); m->sceneRenderer.BeginFrame(); } void CRenderer::EndFrame() { PROFILE3("end frame"); m->sceneRenderer.EndFrame(); } void CRenderer::MakeShadersDirty() { m->ShadersDirty = true; m->sceneRenderer.MakeShadersDirty(); } CTextureManager& CRenderer::GetTextureManager() { return m->textureManager; } CVertexBufferManager& CRenderer::GetVertexBufferManager() { return m->vertexBufferManager; } CShaderManager& CRenderer::GetShaderManager() { return m->shaderManager; } CTimeManager& CRenderer::GetTimeManager() { return m->timeManager; } CPostprocManager& CRenderer::GetPostprocManager() { return m->postprocManager; } CSceneRenderer& CRenderer::GetSceneRenderer() { return m->sceneRenderer; } CDebugRenderer& CRenderer::GetDebugRenderer() { return m->debugRenderer; } CFontManager& CRenderer::GetFontManager() { return m->fontManager; } void CRenderer::PreloadResourcesBeforeNextFrame() { m_ShouldPreloadResourcesBeforeNextFrame = true; } void CRenderer::MakeScreenShotOnNextFrame(ScreenShotType screenShotType) { m_ScreenShotType = screenShotType; } Renderer::Backend::IDeviceCommandContext* 
CRenderer::GetDeviceCommandContext() { return m->deviceCommandContext.get(); } Renderer::Backend::IVertexInputLayout* CRenderer::GetVertexInputLayout( const PS::span attributes) { const auto [it, inserted] = m->vertexInputLayouts.emplace( std::vector{attributes.begin(), attributes.end()}, nullptr); if (inserted) it->second = m->device->CreateVertexInputLayout(attributes); return it->second.get(); } Index: ps/trunk/source/renderer/backend/IDeviceCommandContext.h =================================================================== --- ps/trunk/source/renderer/backend/IDeviceCommandContext.h (revision 28009) +++ ps/trunk/source/renderer/backend/IDeviceCommandContext.h (revision 28010) @@ -1,214 +1,247 @@ -/* Copyright (C) 2023 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . */ #ifndef INCLUDED_RENDERER_BACKEND_IDEVICECOMMANDCONTEXT #define INCLUDED_RENDERER_BACKEND_IDEVICECOMMANDCONTEXT #include "ps/containers/Span.h" #include "renderer/backend/Format.h" #include "renderer/backend/IDeviceObject.h" #include "renderer/backend/PipelineState.h" #include "renderer/backend/Sampler.h" #include #include namespace Renderer { namespace Backend { class IBuffer; class IDevice; class IFramebuffer; class ITexture; class IDeviceCommandContext : public IDeviceObject { public: /** * Binds the graphics pipeline state. 
It should be called only inside a * framebuffer pass and as rarely as possible. */ virtual void SetGraphicsPipelineState(IGraphicsPipelineState* pipelineState) = 0; + /** + * Binds the graphics pipeline state. It should be called only inside a + * framebuffer pass and as rarely as possible. + */ + virtual void SetComputePipelineState(IComputePipelineState* pipelineState) = 0; + // TODO: maybe we should add a more common type, like CRectI. struct Rect { int32_t x, y; int32_t width, height; }; /** * Copies source region into destination region automatically applying * compatible format conversion and scaling using a provided filter. * A backbuffer can't be a source. */ virtual void BlitFramebuffer( IFramebuffer* sourceFramebuffer, IFramebuffer* destinationFramebuffer, const Rect& sourceRegion, const Rect& destinationRegion, const Sampler::Filter filter) = 0; /** * Resolves multisample source framebuffer attachments to destination * attachments. Source attachments should have a sample count > 1 and * destination attachments should have a sample count = 1. * A backbuffer can't be a source. */ virtual void ResolveFramebuffer( IFramebuffer* sourceFramebuffer, IFramebuffer* destinationFramebuffer) = 0; /** * Starts a framebuffer pass, performs attachment load operations. * It should be called as rarely as possible. * * @see IFramebuffer */ virtual void BeginFramebufferPass(IFramebuffer* framebuffer) = 0; /** * Finishes a framebuffer pass, performs attachment store operations. */ virtual void EndFramebufferPass() = 0; /** * Clears all mentioned attachments. Prefer to use attachment load operations over * this function. It should be called only inside a framebuffer pass. */ virtual void ClearFramebuffer(const bool color, const bool depth, const bool stencil) = 0; /** * Readbacks the current backbuffer to data in R8G8B8_UNORM format somewhen * between the function call and Flush (inclusively). 
Because of that the * data pointer should be valid in that time period and have enough space * to fit the readback result. * @note this operation is very slow and should not be used regularly. * TODO: ideally we should do readback on Present or even asynchronously * but a client doesn't support that yet. */ virtual void ReadbackFramebufferSync( const uint32_t x, const uint32_t y, const uint32_t width, const uint32_t height, void* data) = 0; virtual void UploadTexture(ITexture* texture, const Format dataFormat, const void* data, const size_t dataSize, const uint32_t level = 0, const uint32_t layer = 0) = 0; virtual void UploadTextureRegion(ITexture* texture, const Format dataFormat, const void* data, const size_t dataSize, const uint32_t xOffset, const uint32_t yOffset, const uint32_t width, const uint32_t height, const uint32_t level = 0, const uint32_t layer = 0) = 0; using UploadBufferFunction = std::function; virtual void UploadBuffer(IBuffer* buffer, const void* data, const uint32_t dataSize) = 0; virtual void UploadBuffer(IBuffer* buffer, const UploadBufferFunction& uploadFunction) = 0; virtual void UploadBufferRegion( IBuffer* buffer, const void* data, const uint32_t dataOffset, const uint32_t dataSize) = 0; virtual void UploadBufferRegion( IBuffer* buffer, const uint32_t dataOffset, const uint32_t dataSize, const UploadBufferFunction& uploadFunction) = 0; virtual void SetScissors(const uint32_t scissorCount, const Rect* scissors) = 0; virtual void SetViewports(const uint32_t viewportCount, const Rect* viewports) = 0; /** * Binds the vertex input layout. It should be compatible with the shader * program's one. It should be called only inside a framebuffer pass and as * rarely as possible. 
*/ virtual void SetVertexInputLayout( IVertexInputLayout* vertexInputLayout) = 0; virtual void SetVertexBuffer( const uint32_t bindingSlot, IBuffer* buffer, const uint32_t offset) = 0; virtual void SetVertexBufferData( const uint32_t bindingSlot, const void* data, const uint32_t dataSize) = 0; virtual void SetIndexBuffer(IBuffer* buffer) = 0; virtual void SetIndexBufferData(const void* data, const uint32_t dataSize) = 0; virtual void BeginPass() = 0; virtual void EndPass() = 0; virtual void Draw(const uint32_t firstVertex, const uint32_t vertexCount) = 0; virtual void DrawIndexed( const uint32_t firstIndex, const uint32_t indexCount, const int32_t vertexOffset) = 0; virtual void DrawInstanced( const uint32_t firstVertex, const uint32_t vertexCount, const uint32_t firstInstance, const uint32_t instanceCount) = 0; virtual void DrawIndexedInstanced( const uint32_t firstIndex, const uint32_t indexCount, const uint32_t firstInstance, const uint32_t instanceCount, const int32_t vertexOffset) = 0; // TODO: should be removed when performance impact is minimal on slow hardware. virtual void DrawIndexedInRange( const uint32_t firstIndex, const uint32_t indexCount, const uint32_t start, const uint32_t end) = 0; + /** + * Starts a compute pass, can't be called inside a framebuffer pass. + * It should be called as rarely as possible. + */ + virtual void BeginComputePass() = 0; + + /** + * Finishes a compute pass. + */ + virtual void EndComputePass() = 0; + + /** + * Dispatches groupCountX * groupCountY * groupCountZ compute groups. + */ + virtual void Dispatch( + const uint32_t groupCountX, + const uint32_t groupCountY, + const uint32_t groupCountZ) = 0; + + /** + * Sets a read-only texture to the binding slot. + */ virtual void SetTexture(const int32_t bindingSlot, ITexture* texture) = 0; + /** + * Sets a read & write resource to the binding slot. 
+ */ + virtual void SetStorageTexture(const int32_t bindingSlot, ITexture* texture) = 0; + virtual void SetUniform( const int32_t bindingSlot, const float value) = 0; virtual void SetUniform( const int32_t bindingSlot, const float valueX, const float valueY) = 0; virtual void SetUniform( const int32_t bindingSlot, const float valueX, const float valueY, const float valueZ) = 0; virtual void SetUniform( const int32_t bindingSlot, const float valueX, const float valueY, const float valueZ, const float valueW) = 0; virtual void SetUniform( const int32_t bindingSlot, PS::span values) = 0; virtual void BeginScopedLabel(const char* name) = 0; virtual void EndScopedLabel() = 0; virtual void Flush() = 0; }; } // namespace Backend } // namespace Renderer #define GPU_SCOPED_LABEL(deviceCommandContext, name) \ GPUScopedLabel scopedLabel((deviceCommandContext), (name)); class GPUScopedLabel { public: GPUScopedLabel( Renderer::Backend::IDeviceCommandContext* deviceCommandContext, const char* name) : m_DeviceCommandContext(deviceCommandContext) { m_DeviceCommandContext->BeginScopedLabel(name); } ~GPUScopedLabel() { m_DeviceCommandContext->EndScopedLabel(); } private: Renderer::Backend::IDeviceCommandContext* m_DeviceCommandContext = nullptr; }; #endif // INCLUDED_RENDERER_BACKEND_IDEVICECOMMANDCONTEXT Index: ps/trunk/source/renderer/backend/dummy/Device.cpp =================================================================== --- ps/trunk/source/renderer/backend/dummy/Device.cpp (revision 28009) +++ ps/trunk/source/renderer/backend/dummy/Device.cpp (revision 28010) @@ -1,171 +1,177 @@ -/* Copyright (C) 2023 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. 
is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . */ #include "precompiled.h" #include "Device.h" #include "renderer/backend/dummy/Buffer.h" #include "renderer/backend/dummy/DeviceCommandContext.h" #include "renderer/backend/dummy/Framebuffer.h" #include "renderer/backend/dummy/PipelineState.h" #include "renderer/backend/dummy/ShaderProgram.h" #include "renderer/backend/dummy/Texture.h" #include "scriptinterface/JSON.h" #include "scriptinterface/Object.h" #include "scriptinterface/ScriptInterface.h" #include "scriptinterface/ScriptRequest.h" namespace Renderer { namespace Backend { namespace Dummy { CDevice::CDevice() { m_Name = "Dummy"; m_Version = "Unknown"; m_DriverInformation = "Unknown"; m_Extensions = {}; m_Backbuffer = CFramebuffer::Create(this); m_Capabilities.S3TC = true; m_Capabilities.ARBShaders = false; m_Capabilities.ARBShadersShadow = false; m_Capabilities.computeShaders = true; m_Capabilities.debugLabels = true; m_Capabilities.debugScopedLabels = true; m_Capabilities.multisampling = true; m_Capabilities.anisotropicFiltering = true; m_Capabilities.maxSampleCount = 4u; m_Capabilities.maxAnisotropy = 16.0f; m_Capabilities.maxTextureSize = 8192u; m_Capabilities.instancing = true; } CDevice::~CDevice() = default; void CDevice::Report(const ScriptRequest& rq, JS::HandleValue settings) { Script::SetProperty(rq, settings, "name", "dummy"); } std::unique_ptr CDevice::CreateCommandContext() { return CDeviceCommandContext::Create(this); } std::unique_ptr CDevice::CreateGraphicsPipelineState( const SGraphicsPipelineStateDesc& pipelineStateDesc) { return CGraphicsPipelineState::Create(this, pipelineStateDesc); } +std::unique_ptr CDevice::CreateComputePipelineState( + const 
SComputePipelineStateDesc& pipelineStateDesc) +{ + return CComputePipelineState::Create(this, pipelineStateDesc); +} + std::unique_ptr CDevice::CreateVertexInputLayout( const PS::span UNUSED(attributes)) { return nullptr; } std::unique_ptr CDevice::CreateTexture( const char* UNUSED(name), const CTexture::Type type, const uint32_t usage, const Format format, const uint32_t width, const uint32_t height, const Sampler::Desc& UNUSED(defaultSamplerDesc), const uint32_t MIPLevelCount, const uint32_t UNUSED(sampleCount)) { return CTexture::Create(this, type, usage, format, width, height, MIPLevelCount); } std::unique_ptr CDevice::CreateTexture2D( const char* name, const uint32_t usage, const Format format, const uint32_t width, const uint32_t height, const Sampler::Desc& defaultSamplerDesc, const uint32_t MIPLevelCount, const uint32_t sampleCount) { return CreateTexture(name, ITexture::Type::TEXTURE_2D, usage, format, width, height, defaultSamplerDesc, MIPLevelCount, sampleCount); } std::unique_ptr CDevice::CreateFramebuffer( const char*, SColorAttachment*, SDepthStencilAttachment*) { return CFramebuffer::Create(this); } std::unique_ptr CDevice::CreateBuffer( const char*, const CBuffer::Type type, const uint32_t size, const bool dynamic) { return CBuffer::Create(this, type, size, dynamic); } std::unique_ptr CDevice::CreateShaderProgram( const CStr&, const CShaderDefines&) { return CShaderProgram::Create(this); } bool CDevice::AcquireNextBackbuffer() { // We have nothing to acquire. return true; } IFramebuffer* CDevice::GetCurrentBackbuffer( const AttachmentLoadOp, const AttachmentStoreOp, const AttachmentLoadOp, const AttachmentStoreOp) { return m_Backbuffer.get(); } void CDevice::Present() { // We have nothing to present. 
} void CDevice::OnWindowResize(const uint32_t UNUSED(width), const uint32_t UNUSED(height)) { } bool CDevice::IsTextureFormatSupported(const Format UNUSED(format)) const { return true; } bool CDevice::IsFramebufferFormatSupported(const Format UNUSED(format)) const { return true; } Format CDevice::GetPreferredDepthStencilFormat( const uint32_t, const bool, const bool) const { return Format::D24_UNORM_S8_UINT; } std::unique_ptr CreateDevice(SDL_Window* UNUSED(window)) { return std::make_unique(); } } // namespace Dummy } // namespace Backend } // namespace Renderer Index: ps/trunk/source/renderer/backend/dummy/DeviceCommandContext.h =================================================================== --- ps/trunk/source/renderer/backend/dummy/DeviceCommandContext.h (revision 28009) +++ ps/trunk/source/renderer/backend/dummy/DeviceCommandContext.h (revision 28010) @@ -1,154 +1,165 @@ -/* Copyright (C) 2023 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . 
*/ #ifndef INCLUDED_RENDERER_BACKEND_DUMMY_DEVICECOMMANDCONTEXT #define INCLUDED_RENDERER_BACKEND_DUMMY_DEVICECOMMANDCONTEXT #include "renderer/backend/Format.h" #include "renderer/backend/IDeviceCommandContext.h" #include "renderer/backend/PipelineState.h" #include namespace Renderer { namespace Backend { namespace Dummy { class CDevice; class CBuffer; class CFramebuffer; class CShaderProgram; class CTexture; class CDeviceCommandContext : public IDeviceCommandContext { public: ~CDeviceCommandContext(); IDevice* GetDevice() override; void SetGraphicsPipelineState(IGraphicsPipelineState* pipelineState) override; + void SetComputePipelineState(IComputePipelineState* pipelineState) override; void BlitFramebuffer( IFramebuffer* sourceFramebuffer, IFramebuffer* destinationFramebuffer, const Rect& sourceRegion, const Rect& destinationRegion, const Sampler::Filter filter) override; void ResolveFramebuffer( IFramebuffer* sourceFramebuffer, IFramebuffer* destinationFramebuffer) override; void ClearFramebuffer(const bool color, const bool depth, const bool stencil) override; void BeginFramebufferPass(IFramebuffer* framebuffer) override; void EndFramebufferPass() override; void ReadbackFramebufferSync( const uint32_t x, const uint32_t y, const uint32_t width, const uint32_t height, void* data) override; void UploadTexture(ITexture* texture, const Format dataFormat, const void* data, const size_t dataSize, const uint32_t level = 0, const uint32_t layer = 0) override; void UploadTextureRegion(ITexture* texture, const Format dataFormat, const void* data, const size_t dataSize, const uint32_t xOffset, const uint32_t yOffset, const uint32_t width, const uint32_t height, const uint32_t level = 0, const uint32_t layer = 0) override; using UploadBufferFunction = std::function; void UploadBuffer(IBuffer* buffer, const void* data, const uint32_t dataSize) override; void UploadBuffer(IBuffer* buffer, const UploadBufferFunction& uploadFunction) override; void UploadBufferRegion( IBuffer* 
buffer, const void* data, const uint32_t dataOffset, const uint32_t dataSize) override; void UploadBufferRegion( IBuffer* buffer, const uint32_t dataOffset, const uint32_t dataSize, const UploadBufferFunction& uploadFunction) override; void SetScissors(const uint32_t scissorCount, const Rect* scissors) override; void SetViewports(const uint32_t viewportCount, const Rect* viewports) override; void SetVertexInputLayout( IVertexInputLayout* vertexInputLayout) override; void SetVertexBuffer( const uint32_t bindingSlot, IBuffer* buffer, const uint32_t offset) override; void SetVertexBufferData( const uint32_t bindingSlot, const void* data, const uint32_t dataSize) override; void SetIndexBuffer(IBuffer* buffer) override; void SetIndexBufferData(const void* data, const uint32_t dataSize) override; void BeginPass() override; void EndPass() override; void Draw(const uint32_t firstVertex, const uint32_t vertexCount) override; void DrawIndexed( const uint32_t firstIndex, const uint32_t indexCount, const int32_t vertexOffset) override; void DrawInstanced( const uint32_t firstVertex, const uint32_t vertexCount, const uint32_t firstInstance, const uint32_t instanceCount) override; void DrawIndexedInstanced( const uint32_t firstIndex, const uint32_t indexCount, const uint32_t firstInstance, const uint32_t instanceCount, const int32_t vertexOffset) override; void DrawIndexedInRange( const uint32_t firstIndex, const uint32_t indexCount, const uint32_t start, const uint32_t end) override; + void BeginComputePass() override; + void EndComputePass() override; + + void Dispatch( + const uint32_t groupCountX, + const uint32_t groupCountY, + const uint32_t groupCountZ) override; + void SetTexture(const int32_t bindingSlot, ITexture* texture) override; + void SetStorageTexture(const int32_t bindingSlot, ITexture* texture) override; + void SetUniform( const int32_t bindingSlot, const float value) override; void SetUniform( const int32_t bindingSlot, const float valueX, const float valueY) 
override; void SetUniform( const int32_t bindingSlot, const float valueX, const float valueY, const float valueZ) override; void SetUniform( const int32_t bindingSlot, const float valueX, const float valueY, const float valueZ, const float valueW) override; void SetUniform( const int32_t bindingSlot, PS::span values) override; void BeginScopedLabel(const char* name) override; void EndScopedLabel() override; void Flush() override; private: friend class CDevice; static std::unique_ptr Create(CDevice* device); CDeviceCommandContext(); CDevice* m_Device = nullptr; }; } // namespace Dummy } // namespace Backend } // namespace Renderer #endif // INCLUDED_RENDERER_BACKEND_DUMMY_DEVICECOMMANDCONTEXT Index: ps/trunk/source/renderer/backend/gl/Device.cpp =================================================================== --- ps/trunk/source/renderer/backend/gl/Device.cpp (revision 28009) +++ ps/trunk/source/renderer/backend/gl/Device.cpp (revision 28010) @@ -1,1074 +1,1080 @@ -/* Copyright (C) 2023 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . 
*/ #include "precompiled.h" #include "Device.h" #include "lib/external_libraries/libsdl.h" #include "lib/hash.h" #include "lib/ogl.h" #include "ps/CLogger.h" #include "ps/ConfigDB.h" #include "ps/Profile.h" #include "renderer/backend/gl/DeviceCommandContext.h" #include "renderer/backend/gl/PipelineState.h" #include "renderer/backend/gl/Texture.h" #include "scriptinterface/JSON.h" #include "scriptinterface/Object.h" #include "scriptinterface/ScriptInterface.h" #include "scriptinterface/ScriptRequest.h" #if OS_WIN // We can't include wutil directly because GL headers conflict with Windows // until we use a proper GL loader. extern void* wutil_GetAppHDC(); #endif #include #include #include #if !CONFIG2_GLES && (defined(SDL_VIDEO_DRIVER_X11) || defined(SDL_VIDEO_DRIVER_WAYLAND)) #if defined(SDL_VIDEO_DRIVER_X11) #include #endif #if defined(SDL_VIDEO_DRIVER_WAYLAND) #include #endif #include #endif // !CONFIG2_GLES && (defined(SDL_VIDEO_DRIVER_X11) || defined(SDL_VIDEO_DRIVER_WAYLAND)) namespace Renderer { namespace Backend { namespace GL { namespace { std::string GetNameImpl() { // GL_VENDOR+GL_RENDERER are good enough here, so we don't use WMI to detect the cards. // On top of that WMI can cause crashes with Nvidia Optimus and some netbooks // see http://trac.wildfiregames.com/ticket/1952 // http://trac.wildfiregames.com/ticket/1575 char cardName[128]; const char* vendor = reinterpret_cast(glGetString(GL_VENDOR)); const char* renderer = reinterpret_cast(glGetString(GL_RENDERER)); // Happens if called before GL initialization. if (!vendor || !renderer) return {}; sprintf_s(cardName, std::size(cardName), "%s %s", vendor, renderer); // Remove crap from vendor names. (don't dare touch the model name - // it's too risky, there are too many different strings). 
#define SHORTEN(what, charsToKeep) \ if (!strncmp(cardName, what, std::size(what) - 1)) \ memmove(cardName + charsToKeep, cardName + std::size(what) - 1, (strlen(cardName) - (std::size(what) - 1) + 1) * sizeof(char)); SHORTEN("ATI Technologies Inc.", 3); SHORTEN("NVIDIA Corporation", 6); SHORTEN("S3 Graphics", 2); // returned by EnumDisplayDevices SHORTEN("S3 Graphics, Incorporated", 2); // returned by GL_VENDOR #undef SHORTEN return cardName; } std::string GetVersionImpl() { return reinterpret_cast(glGetString(GL_VERSION)); } std::string GetDriverInformationImpl() { // Usually GL_VERSION contains both OpenGL and driver versions. return reinterpret_cast(glGetString(GL_VERSION)); } std::vector GetExtensionsImpl() { std::vector extensions; const std::string exts = ogl_ExtensionString(); boost::split(extensions, exts, boost::algorithm::is_space(), boost::token_compress_on); std::sort(extensions.begin(), extensions.end()); return extensions; } void GLAD_API_PTR OnDebugMessage( GLenum source, GLenum type, GLuint id, GLenum severity, GLsizei UNUSED(length), const GLchar* message, const void* UNUSED(user_param)) { std::string debugSource = "unknown"; std::string debugType = "unknown"; std::string debugSeverity = "unknown"; switch (source) { case GL_DEBUG_SOURCE_API: debugSource = "the API"; break; case GL_DEBUG_SOURCE_WINDOW_SYSTEM: debugSource = "the window system"; break; case GL_DEBUG_SOURCE_SHADER_COMPILER: debugSource = "the shader compiler"; break; case GL_DEBUG_SOURCE_THIRD_PARTY: debugSource = "a third party"; break; case GL_DEBUG_SOURCE_APPLICATION: debugSource = "the application"; break; case GL_DEBUG_SOURCE_OTHER: debugSource = "somewhere"; break; } switch (type) { case GL_DEBUG_TYPE_ERROR: debugType = "error"; break; case GL_DEBUG_TYPE_DEPRECATED_BEHAVIOR: debugType = "deprecated behaviour"; break; case GL_DEBUG_TYPE_UNDEFINED_BEHAVIOR: debugType = "undefined behaviour"; break; case GL_DEBUG_TYPE_PORTABILITY: debugType = "portability"; break; case 
GL_DEBUG_TYPE_PERFORMANCE: debugType = "performance"; break; case GL_DEBUG_TYPE_OTHER: debugType = "other"; break; case GL_DEBUG_TYPE_MARKER: debugType = "marker"; break; case GL_DEBUG_TYPE_PUSH_GROUP: debugType = "push group"; break; case GL_DEBUG_TYPE_POP_GROUP: debugType = "pop group"; break; } switch (severity) { case GL_DEBUG_SEVERITY_HIGH: debugSeverity = "high"; break; case GL_DEBUG_SEVERITY_MEDIUM: debugSeverity = "medium"; break; case GL_DEBUG_SEVERITY_LOW: debugSeverity = "low"; break; case GL_DEBUG_SEVERITY_NOTIFICATION: debugSeverity = "notification"; break; } if (severity == GL_DEBUG_SEVERITY_NOTIFICATION) { debug_printf( "OpenGL | %s: %s source: %s id %u: %s\n", debugSeverity.c_str(), debugType.c_str(), debugSource.c_str(), id, message); } else { LOGWARNING( "OpenGL | %s: %s source: %s id %u: %s\n", debugSeverity.c_str(), debugType.c_str(), debugSource.c_str(), id, message); } } } // anonymous namespace // static std::unique_ptr CDevice::Create(SDL_Window* window, const bool arb) { std::unique_ptr device(new CDevice()); if (window) { // According to https://wiki.libsdl.org/SDL_CreateWindow we don't need to // call SDL_GL_LoadLibrary if we have a window with SDL_WINDOW_OPENGL, // because it'll be called internally for the first created window. device->m_Window = window; device->m_Context = SDL_GL_CreateContext(device->m_Window); if (!device->m_Context) { LOGERROR("SDL_GL_CreateContext failed: '%s'", SDL_GetError()); return nullptr; } SDL_GL_GetDrawableSize(window, &device->m_SurfaceDrawableWidth, &device->m_SurfaceDrawableHeight); #if OS_WIN ogl_Init(SDL_GL_GetProcAddress, wutil_GetAppHDC()); #elif (defined(SDL_VIDEO_DRIVER_X11) || defined(SDL_VIDEO_DRIVER_WAYLAND)) && !CONFIG2_GLES SDL_SysWMinfo wminfo; // The info structure must be initialized with the SDL version. 
SDL_VERSION(&wminfo.version); if (!SDL_GetWindowWMInfo(window, &wminfo)) { LOGERROR("Failed to query SDL WM info: %s", SDL_GetError()); return nullptr; } switch (wminfo.subsystem) { #if defined(SDL_VIDEO_DRIVER_WAYLAND) case SDL_SYSWM_WAYLAND: // TODO: maybe we need to load X11 functions // dynamically as well. ogl_Init(SDL_GL_GetProcAddress, GetWaylandDisplay(device->m_Window), static_cast(wminfo.subsystem)); break; #endif #if defined(SDL_VIDEO_DRIVER_X11) case SDL_SYSWM_X11: ogl_Init(SDL_GL_GetProcAddress, GetX11Display(device->m_Window), static_cast(wminfo.subsystem)); break; #endif default: ogl_Init(SDL_GL_GetProcAddress, nullptr, static_cast(wminfo.subsystem)); break; } #else ogl_Init(SDL_GL_GetProcAddress); #endif } else { #if OS_WIN ogl_Init(SDL_GL_GetProcAddress, wutil_GetAppHDC()); #elif (defined(SDL_VIDEO_DRIVER_X11) || defined(SDL_VIDEO_DRIVER_WAYLAND)) && !CONFIG2_GLES bool initialized = false; // Currently we don't have access to the backend type without // the window. So we use hack to detect X11. #if defined(SDL_VIDEO_DRIVER_X11) Display* display = XOpenDisplay(NULL); if (display) { ogl_Init(SDL_GL_GetProcAddress, display, static_cast(SDL_SYSWM_X11)); initialized = true; } #endif #if defined(SDL_VIDEO_DRIVER_WAYLAND) if (!initialized) { // glad will find default EGLDisplay internally. ogl_Init(SDL_GL_GetProcAddress, nullptr, static_cast(SDL_SYSWM_WAYLAND)); initialized = true; } #endif if (!initialized) { LOGERROR("Can't initialize GL"); return nullptr; } #else ogl_Init(SDL_GL_GetProcAddress); #endif #if OS_WIN || defined(SDL_VIDEO_DRIVER_X11) && !CONFIG2_GLES // Hack to stop things looking very ugly when scrolling in Atlas. ogl_SetVsyncEnabled(true); #endif } // If we don't have GL2.0 then we don't have GLSL in core. 
if (!arb && !ogl_HaveVersion(2, 0)) return nullptr; if ((ogl_HaveExtensions(0, "GL_ARB_vertex_program", "GL_ARB_fragment_program", nullptr) // ARB && !ogl_HaveVersion(2, 0)) // GLSL || !ogl_HaveExtension("GL_ARB_vertex_buffer_object") // VBO || ogl_HaveExtensions(0, "GL_ARB_multitexture", "GL_EXT_draw_range_elements", nullptr) || (!ogl_HaveExtension("GL_EXT_framebuffer_object") && !ogl_HaveExtension("GL_ARB_framebuffer_object"))) { // It doesn't make sense to continue working here, because we're not // able to display anything. DEBUG_DISPLAY_FATAL_ERROR( L"Your graphics card doesn't appear to be fully compatible with OpenGL shaders." L" The game does not support pre-shader graphics cards." L" You are advised to try installing newer drivers and/or upgrade your graphics card." L" For more information, please see http://www.wildfiregames.com/forum/index.php?showtopic=16734" ); } device->m_ARB = arb; device->m_Name = GetNameImpl(); device->m_Version = GetVersionImpl(); device->m_DriverInformation = GetDriverInformationImpl(); device->m_Extensions = GetExtensionsImpl(); // Set packing parameters for uploading and downloading data. glPixelStorei(GL_PACK_ALIGNMENT, 1); glPixelStorei(GL_UNPACK_ALIGNMENT, 1); glEnable(GL_TEXTURE_2D); // glEnable(GL_TEXTURE_2D) is deprecated and might trigger an error. But we // still support pre 2.0 drivers pretending to support 2.0. ogl_SquelchError(GL_INVALID_ENUM); if (arb) { #if !CONFIG2_GLES glEnable(GL_VERTEX_PROGRAM_ARB); glEnable(GL_FRAGMENT_PROGRAM_ARB); #endif } // Some drivers might invalidate an incorrect surface which leads to artifacts. 
bool enableFramebufferInvalidating = false; CFG_GET_VAL("renderer.backend.gl.enableframebufferinvalidating", enableFramebufferInvalidating); if (enableFramebufferInvalidating) { #if CONFIG2_GLES device->m_UseFramebufferInvalidating = ogl_HaveExtension("GL_EXT_discard_framebuffer"); #else device->m_UseFramebufferInvalidating = !arb && ogl_HaveExtension("GL_ARB_invalidate_subdata"); #endif } Capabilities& capabilities = device->m_Capabilities; capabilities.ARBShaders = !ogl_HaveExtensions(0, "GL_ARB_vertex_program", "GL_ARB_fragment_program", nullptr); if (capabilities.ARBShaders) capabilities.ARBShadersShadow = ogl_HaveExtension("GL_ARB_fragment_program_shadow"); - capabilities.computeShaders = ogl_HaveVersion(4, 3) || ogl_HaveExtension("GL_ARB_compute_shader"); + capabilities.computeShaders = ogl_HaveVersion(4, 3) || (ogl_HaveVersion(4, 2) && ogl_HaveExtension("GL_ARB_compute_shader") && ogl_HaveExtension("GL_ARB_shader_image_load_store")); #if CONFIG2_GLES // Some GLES implementations have GL_EXT_texture_compression_dxt1 // but that only supports DXT1 so we can't use it. capabilities.S3TC = ogl_HaveExtensions(0, "GL_EXT_texture_compression_s3tc", nullptr) == 0; #else // Note: we don't bother checking for GL_S3_s3tc - it is incompatible // and irrelevant (was never widespread). capabilities.S3TC = ogl_HaveExtensions(0, "GL_ARB_texture_compression", "GL_EXT_texture_compression_s3tc", nullptr) == 0; #endif #if CONFIG2_GLES capabilities.multisampling = false; capabilities.maxSampleCount = 1; #else capabilities.multisampling = ogl_HaveVersion(3, 3) && ogl_HaveExtension("GL_ARB_multisample") && ogl_HaveExtension("GL_ARB_texture_multisample"); if (capabilities.multisampling) { // By default GL_MULTISAMPLE should be enabled, but enable it for buggy drivers. 
glEnable(GL_MULTISAMPLE); GLint maxSamples = 1; glGetIntegerv(GL_MAX_SAMPLES, &maxSamples); capabilities.maxSampleCount = maxSamples; } #endif capabilities.anisotropicFiltering = ogl_HaveExtension("GL_EXT_texture_filter_anisotropic"); if (capabilities.anisotropicFiltering) { GLfloat maxAnisotropy = 1.0f; glGetFloatv(GL_MAX_TEXTURE_MAX_ANISOTROPY_EXT, &maxAnisotropy); capabilities.maxAnisotropy = maxAnisotropy; } GLint maxTextureSize = 1024; glGetIntegerv(GL_MAX_TEXTURE_SIZE, &maxTextureSize); capabilities.maxTextureSize = maxTextureSize; #if CONFIG2_GLES const bool isDebugInCore = ogl_HaveVersion(3, 2); #else const bool isDebugInCore = ogl_HaveVersion(4, 3); #endif const bool hasDebug = isDebugInCore || ogl_HaveExtension("GL_KHR_debug"); if (hasDebug) { #ifdef NDEBUG bool enableDebugMessages = false; CFG_GET_VAL("renderer.backend.debugmessages", enableDebugMessages); capabilities.debugLabels = false; CFG_GET_VAL("renderer.backend.debuglabels", capabilities.debugLabels); capabilities.debugScopedLabels = false; CFG_GET_VAL("renderer.backend.debugscopedlabels", capabilities.debugScopedLabels); #else const bool enableDebugMessages = true; capabilities.debugLabels = true; capabilities.debugScopedLabels = true; #endif if (enableDebugMessages) { glEnable(GL_DEBUG_OUTPUT); #if !CONFIG2_GLES glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS); #else #warning GLES without GL_DEBUG_OUTPUT_SYNCHRONOUS might call the callback from different threads which might be unsafe. 
#endif glDebugMessageCallback(OnDebugMessage, nullptr); // Filter out our own debug group messages const GLuint id = 0x0AD; glDebugMessageControl( GL_DEBUG_SOURCE_APPLICATION, GL_DEBUG_TYPE_PUSH_GROUP, GL_DONT_CARE, 1, &id, GL_FALSE); glDebugMessageControl( GL_DEBUG_SOURCE_APPLICATION, GL_DEBUG_TYPE_POP_GROUP, GL_DONT_CARE, 1, &id, GL_FALSE); } } #if CONFIG2_GLES capabilities.instancing = false; #else capabilities.instancing = !device->m_ARB && (ogl_HaveVersion(3, 3) || (ogl_HaveExtension("GL_ARB_draw_instanced") && ogl_HaveExtension("GL_ARB_instanced_arrays"))); #endif return device; } CDevice::CDevice() = default; CDevice::~CDevice() { if (m_Context) SDL_GL_DeleteContext(m_Context); } void CDevice::Report(const ScriptRequest& rq, JS::HandleValue settings) { const char* errstr = "(error)"; Script::SetProperty(rq, settings, "name", m_ARB ? "glarb" : "gl"); #define INTEGER(id) do { \ GLint i = -1; \ glGetIntegerv(GL_##id, &i); \ if (ogl_SquelchError(GL_INVALID_ENUM)) \ Script::SetProperty(rq, settings, "GL_" #id, errstr); \ else \ Script::SetProperty(rq, settings, "GL_" #id, i); \ } while (false) #define INTEGER2(id) do { \ GLint i[2] = { -1, -1 }; \ glGetIntegerv(GL_##id, i); \ if (ogl_SquelchError(GL_INVALID_ENUM)) { \ Script::SetProperty(rq, settings, "GL_" #id "[0]", errstr); \ Script::SetProperty(rq, settings, "GL_" #id "[1]", errstr); \ } else { \ Script::SetProperty(rq, settings, "GL_" #id "[0]", i[0]); \ Script::SetProperty(rq, settings, "GL_" #id "[1]", i[1]); \ } \ } while (false) #define FLOAT(id) do { \ GLfloat f = std::numeric_limits::quiet_NaN(); \ glGetFloatv(GL_##id, &f); \ if (ogl_SquelchError(GL_INVALID_ENUM)) \ Script::SetProperty(rq, settings, "GL_" #id, errstr); \ else \ Script::SetProperty(rq, settings, "GL_" #id, f); \ } while (false) #define FLOAT2(id) do { \ GLfloat f[2] = { std::numeric_limits::quiet_NaN(), std::numeric_limits::quiet_NaN() }; \ glGetFloatv(GL_##id, f); \ if (ogl_SquelchError(GL_INVALID_ENUM)) { \ Script::SetProperty(rq, 
settings, "GL_" #id "[0]", errstr); \ Script::SetProperty(rq, settings, "GL_" #id "[1]", errstr); \ } else { \ Script::SetProperty(rq, settings, "GL_" #id "[0]", f[0]); \ Script::SetProperty(rq, settings, "GL_" #id "[1]", f[1]); \ } \ } while (false) #define STRING(id) do { \ const char* c = (const char*)glGetString(GL_##id); \ if (!c) c = ""; \ if (ogl_SquelchError(GL_INVALID_ENUM)) c = errstr; \ Script::SetProperty(rq, settings, "GL_" #id, std::string(c)); \ } while (false) #define QUERY(target, pname) do { \ GLint i = -1; \ glGetQueryivARB(GL_##target, GL_##pname, &i); \ if (ogl_SquelchError(GL_INVALID_ENUM)) \ Script::SetProperty(rq, settings, "GL_" #target ".GL_" #pname, errstr); \ else \ Script::SetProperty(rq, settings, "GL_" #target ".GL_" #pname, i); \ } while (false) #define VERTEXPROGRAM(id) do { \ GLint i = -1; \ glGetProgramivARB(GL_VERTEX_PROGRAM_ARB, GL_##id, &i); \ if (ogl_SquelchError(GL_INVALID_ENUM)) \ Script::SetProperty(rq, settings, "GL_VERTEX_PROGRAM_ARB.GL_" #id, errstr); \ else \ Script::SetProperty(rq, settings, "GL_VERTEX_PROGRAM_ARB.GL_" #id, i); \ } while (false) #define FRAGMENTPROGRAM(id) do { \ GLint i = -1; \ glGetProgramivARB(GL_FRAGMENT_PROGRAM_ARB, GL_##id, &i); \ if (ogl_SquelchError(GL_INVALID_ENUM)) \ Script::SetProperty(rq, settings, "GL_FRAGMENT_PROGRAM_ARB.GL_" #id, errstr); \ else \ Script::SetProperty(rq, settings, "GL_FRAGMENT_PROGRAM_ARB.GL_" #id, i); \ } while (false) #define BOOL(id) INTEGER(id) ogl_WarnIfError(); // Core OpenGL 1.3: // (We don't bother checking extension strings for anything older than 1.3; // it'll just produce harmless warnings) STRING(VERSION); STRING(VENDOR); STRING(RENDERER); STRING(EXTENSIONS); #if !CONFIG2_GLES INTEGER(MAX_CLIP_PLANES); #endif INTEGER(SUBPIXEL_BITS); #if !CONFIG2_GLES INTEGER(MAX_3D_TEXTURE_SIZE); #endif INTEGER(MAX_TEXTURE_SIZE); INTEGER(MAX_CUBE_MAP_TEXTURE_SIZE); INTEGER2(MAX_VIEWPORT_DIMS); #if !CONFIG2_GLES BOOL(RGBA_MODE); BOOL(INDEX_MODE); BOOL(DOUBLEBUFFER); 
BOOL(STEREO); #endif FLOAT2(ALIASED_POINT_SIZE_RANGE); FLOAT2(ALIASED_LINE_WIDTH_RANGE); #if !CONFIG2_GLES INTEGER(MAX_ELEMENTS_INDICES); INTEGER(MAX_ELEMENTS_VERTICES); INTEGER(MAX_TEXTURE_UNITS); #endif INTEGER(SAMPLE_BUFFERS); INTEGER(SAMPLES); // TODO: compressed texture formats INTEGER(RED_BITS); INTEGER(GREEN_BITS); INTEGER(BLUE_BITS); INTEGER(ALPHA_BITS); #if !CONFIG2_GLES INTEGER(INDEX_BITS); #endif INTEGER(DEPTH_BITS); INTEGER(STENCIL_BITS); #if !CONFIG2_GLES // Core OpenGL 2.0 (treated as extensions): if (ogl_HaveExtension("GL_EXT_texture_lod_bias")) { FLOAT(MAX_TEXTURE_LOD_BIAS_EXT); } if (ogl_HaveExtension("GL_ARB_occlusion_query")) { QUERY(SAMPLES_PASSED, QUERY_COUNTER_BITS); } if (ogl_HaveExtension("GL_ARB_shading_language_100")) { STRING(SHADING_LANGUAGE_VERSION_ARB); } if (ogl_HaveExtension("GL_ARB_vertex_shader")) { INTEGER(MAX_VERTEX_ATTRIBS_ARB); INTEGER(MAX_VERTEX_UNIFORM_COMPONENTS_ARB); INTEGER(MAX_VARYING_FLOATS_ARB); INTEGER(MAX_COMBINED_TEXTURE_IMAGE_UNITS_ARB); INTEGER(MAX_VERTEX_TEXTURE_IMAGE_UNITS_ARB); } if (ogl_HaveExtension("GL_ARB_fragment_shader")) { INTEGER(MAX_FRAGMENT_UNIFORM_COMPONENTS_ARB); } if (ogl_HaveExtension("GL_ARB_vertex_shader") || ogl_HaveExtension("GL_ARB_fragment_shader") || ogl_HaveExtension("GL_ARB_vertex_program") || ogl_HaveExtension("GL_ARB_fragment_program")) { INTEGER(MAX_TEXTURE_IMAGE_UNITS_ARB); INTEGER(MAX_TEXTURE_COORDS_ARB); } if (ogl_HaveExtension("GL_ARB_draw_buffers")) { INTEGER(MAX_DRAW_BUFFERS_ARB); } // Core OpenGL 3.0: if (ogl_HaveExtension("GL_EXT_gpu_shader4")) { INTEGER(MIN_PROGRAM_TEXEL_OFFSET_EXT); // no _EXT version of these in glext.h INTEGER(MAX_PROGRAM_TEXEL_OFFSET_EXT); } if (ogl_HaveExtension("GL_EXT_framebuffer_object")) { INTEGER(MAX_COLOR_ATTACHMENTS_EXT); INTEGER(MAX_RENDERBUFFER_SIZE_EXT); } if (ogl_HaveExtension("GL_EXT_framebuffer_multisample")) { INTEGER(MAX_SAMPLES_EXT); } if (ogl_HaveExtension("GL_EXT_texture_array")) { INTEGER(MAX_ARRAY_TEXTURE_LAYERS_EXT); } if 
(ogl_HaveExtension("GL_EXT_transform_feedback")) { INTEGER(MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS_EXT); INTEGER(MAX_TRANSFORM_FEEDBACK_SEPARATE_ATTRIBS_EXT); INTEGER(MAX_TRANSFORM_FEEDBACK_SEPARATE_COMPONENTS_EXT); } // Other interesting extensions: if (ogl_HaveExtension("GL_EXT_timer_query") || ogl_HaveExtension("GL_ARB_timer_query")) { QUERY(TIME_ELAPSED, QUERY_COUNTER_BITS); } if (ogl_HaveExtension("GL_ARB_timer_query")) { QUERY(TIMESTAMP, QUERY_COUNTER_BITS); } if (ogl_HaveExtension("GL_EXT_texture_filter_anisotropic")) { FLOAT(MAX_TEXTURE_MAX_ANISOTROPY_EXT); } if (ogl_HaveExtension("GL_ARB_texture_rectangle")) { INTEGER(MAX_RECTANGLE_TEXTURE_SIZE_ARB); } if (m_ARB) { if (ogl_HaveExtension("GL_ARB_vertex_program") || ogl_HaveExtension("GL_ARB_fragment_program")) { INTEGER(MAX_PROGRAM_MATRICES_ARB); INTEGER(MAX_PROGRAM_MATRIX_STACK_DEPTH_ARB); } if (ogl_HaveExtension("GL_ARB_vertex_program")) { VERTEXPROGRAM(MAX_PROGRAM_ENV_PARAMETERS_ARB); VERTEXPROGRAM(MAX_PROGRAM_LOCAL_PARAMETERS_ARB); VERTEXPROGRAM(MAX_PROGRAM_INSTRUCTIONS_ARB); VERTEXPROGRAM(MAX_PROGRAM_TEMPORARIES_ARB); VERTEXPROGRAM(MAX_PROGRAM_PARAMETERS_ARB); VERTEXPROGRAM(MAX_PROGRAM_ATTRIBS_ARB); VERTEXPROGRAM(MAX_PROGRAM_ADDRESS_REGISTERS_ARB); VERTEXPROGRAM(MAX_PROGRAM_NATIVE_INSTRUCTIONS_ARB); VERTEXPROGRAM(MAX_PROGRAM_NATIVE_TEMPORARIES_ARB); VERTEXPROGRAM(MAX_PROGRAM_NATIVE_PARAMETERS_ARB); VERTEXPROGRAM(MAX_PROGRAM_NATIVE_ATTRIBS_ARB); VERTEXPROGRAM(MAX_PROGRAM_NATIVE_ADDRESS_REGISTERS_ARB); if (ogl_HaveExtension("GL_ARB_fragment_program")) { // The spec seems to say these should be supported, but // Mesa complains about them so let's not bother /* VERTEXPROGRAM(MAX_PROGRAM_ALU_INSTRUCTIONS_ARB); VERTEXPROGRAM(MAX_PROGRAM_TEX_INSTRUCTIONS_ARB); VERTEXPROGRAM(MAX_PROGRAM_TEX_INDIRECTIONS_ARB); VERTEXPROGRAM(MAX_PROGRAM_NATIVE_ALU_INSTRUCTIONS_ARB); VERTEXPROGRAM(MAX_PROGRAM_NATIVE_TEX_INSTRUCTIONS_ARB); VERTEXPROGRAM(MAX_PROGRAM_NATIVE_TEX_INDIRECTIONS_ARB); */ } } if 
(ogl_HaveExtension("GL_ARB_fragment_program")) { FRAGMENTPROGRAM(MAX_PROGRAM_ENV_PARAMETERS_ARB); FRAGMENTPROGRAM(MAX_PROGRAM_LOCAL_PARAMETERS_ARB); FRAGMENTPROGRAM(MAX_PROGRAM_INSTRUCTIONS_ARB); FRAGMENTPROGRAM(MAX_PROGRAM_ALU_INSTRUCTIONS_ARB); FRAGMENTPROGRAM(MAX_PROGRAM_TEX_INSTRUCTIONS_ARB); FRAGMENTPROGRAM(MAX_PROGRAM_TEX_INDIRECTIONS_ARB); FRAGMENTPROGRAM(MAX_PROGRAM_TEMPORARIES_ARB); FRAGMENTPROGRAM(MAX_PROGRAM_PARAMETERS_ARB); FRAGMENTPROGRAM(MAX_PROGRAM_ATTRIBS_ARB); FRAGMENTPROGRAM(MAX_PROGRAM_NATIVE_INSTRUCTIONS_ARB); FRAGMENTPROGRAM(MAX_PROGRAM_NATIVE_ALU_INSTRUCTIONS_ARB); FRAGMENTPROGRAM(MAX_PROGRAM_NATIVE_TEX_INSTRUCTIONS_ARB); FRAGMENTPROGRAM(MAX_PROGRAM_NATIVE_TEX_INDIRECTIONS_ARB); FRAGMENTPROGRAM(MAX_PROGRAM_NATIVE_TEMPORARIES_ARB); FRAGMENTPROGRAM(MAX_PROGRAM_NATIVE_PARAMETERS_ARB); FRAGMENTPROGRAM(MAX_PROGRAM_NATIVE_ATTRIBS_ARB); if (ogl_HaveExtension("GL_ARB_vertex_program")) { // The spec seems to say these should be supported, but // Intel drivers on Windows complain about them so let's not bother /* FRAGMENTPROGRAM(MAX_PROGRAM_ADDRESS_REGISTERS_ARB); FRAGMENTPROGRAM(MAX_PROGRAM_NATIVE_ADDRESS_REGISTERS_ARB); */ } } } if (ogl_HaveExtension("GL_ARB_geometry_shader4")) { INTEGER(MAX_GEOMETRY_TEXTURE_IMAGE_UNITS_ARB); INTEGER(MAX_GEOMETRY_OUTPUT_VERTICES_ARB); INTEGER(MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS_ARB); INTEGER(MAX_GEOMETRY_UNIFORM_COMPONENTS_ARB); INTEGER(MAX_GEOMETRY_VARYING_COMPONENTS_ARB); INTEGER(MAX_VERTEX_VARYING_COMPONENTS_ARB); } #else // CONFIG2_GLES // Core OpenGL ES 2.0: STRING(SHADING_LANGUAGE_VERSION); INTEGER(MAX_VERTEX_ATTRIBS); INTEGER(MAX_VERTEX_UNIFORM_VECTORS); INTEGER(MAX_VARYING_VECTORS); INTEGER(MAX_COMBINED_TEXTURE_IMAGE_UNITS); INTEGER(MAX_VERTEX_TEXTURE_IMAGE_UNITS); INTEGER(MAX_FRAGMENT_UNIFORM_VECTORS); INTEGER(MAX_TEXTURE_IMAGE_UNITS); INTEGER(MAX_RENDERBUFFER_SIZE); #endif // CONFIG2_GLES // TODO: Support OpenGL platforms which don't use GLX as well. 
#if defined(SDL_VIDEO_DRIVER_X11) && !CONFIG2_GLES #define GLXQCR_INTEGER(id) do { \ unsigned int i = UINT_MAX; \ if (glXQueryCurrentRendererIntegerMESA(id, &i)) \ Script::SetProperty(rq, settings, #id, i); \ } while (false) #define GLXQCR_INTEGER2(id) do { \ unsigned int i[2] = { UINT_MAX, UINT_MAX }; \ if (glXQueryCurrentRendererIntegerMESA(id, i)) { \ Script::SetProperty(rq, settings, #id "[0]", i[0]); \ Script::SetProperty(rq, settings, #id "[1]", i[1]); \ } \ } while (false) #define GLXQCR_INTEGER3(id) do { \ unsigned int i[3] = { UINT_MAX, UINT_MAX, UINT_MAX }; \ if (glXQueryCurrentRendererIntegerMESA(id, i)) { \ Script::SetProperty(rq, settings, #id "[0]", i[0]); \ Script::SetProperty(rq, settings, #id "[1]", i[1]); \ Script::SetProperty(rq, settings, #id "[2]", i[2]); \ } \ } while (false) #define GLXQCR_STRING(id) do { \ const char* str = glXQueryCurrentRendererStringMESA(id); \ if (str) \ Script::SetProperty(rq, settings, #id ".string", str); \ } while (false) SDL_SysWMinfo wminfo; SDL_VERSION(&wminfo.version); const int ret = SDL_GetWindowWMInfo(m_Window, &wminfo); if (ret && wminfo.subsystem == SDL_SYSWM_X11) { Display* dpy = wminfo.info.x11.display; int scrnum = DefaultScreen(dpy); const char* glxexts = glXQueryExtensionsString(dpy, scrnum); Script::SetProperty(rq, settings, "GLX_EXTENSIONS", glxexts); if (strstr(glxexts, "GLX_MESA_query_renderer") && glXQueryCurrentRendererIntegerMESA && glXQueryCurrentRendererStringMESA) { GLXQCR_INTEGER(GLX_RENDERER_VENDOR_ID_MESA); GLXQCR_INTEGER(GLX_RENDERER_DEVICE_ID_MESA); GLXQCR_INTEGER3(GLX_RENDERER_VERSION_MESA); GLXQCR_INTEGER(GLX_RENDERER_ACCELERATED_MESA); GLXQCR_INTEGER(GLX_RENDERER_VIDEO_MEMORY_MESA); GLXQCR_INTEGER(GLX_RENDERER_UNIFIED_MEMORY_ARCHITECTURE_MESA); GLXQCR_INTEGER(GLX_RENDERER_PREFERRED_PROFILE_MESA); GLXQCR_INTEGER2(GLX_RENDERER_OPENGL_CORE_PROFILE_VERSION_MESA); GLXQCR_INTEGER2(GLX_RENDERER_OPENGL_COMPATIBILITY_PROFILE_VERSION_MESA); 
GLXQCR_INTEGER2(GLX_RENDERER_OPENGL_ES_PROFILE_VERSION_MESA); GLXQCR_INTEGER2(GLX_RENDERER_OPENGL_ES2_PROFILE_VERSION_MESA); GLXQCR_STRING(GLX_RENDERER_VENDOR_ID_MESA); GLXQCR_STRING(GLX_RENDERER_DEVICE_ID_MESA); } } #endif // SDL_VIDEO_DRIVER_X11 } std::unique_ptr CDevice::CreateCommandContext() { std::unique_ptr commandContet = CDeviceCommandContext::Create(this); m_ActiveCommandContext = commandContet.get(); return commandContet; } std::unique_ptr CDevice::CreateGraphicsPipelineState( const SGraphicsPipelineStateDesc& pipelineStateDesc) { return CGraphicsPipelineState::Create(this, pipelineStateDesc); } +std::unique_ptr CDevice::CreateComputePipelineState( + const SComputePipelineStateDesc& pipelineStateDesc) +{ + return CComputePipelineState::Create(this, pipelineStateDesc); +} + std::unique_ptr CDevice::CreateVertexInputLayout( const PS::span attributes) { return std::make_unique(this, attributes); } std::unique_ptr CDevice::CreateTexture( const char* name, const ITexture::Type type, const uint32_t usage, const Format format, const uint32_t width, const uint32_t height, const Sampler::Desc& defaultSamplerDesc, const uint32_t MIPLevelCount, const uint32_t sampleCount) { return CTexture::Create(this, name, type, usage, format, width, height, defaultSamplerDesc, MIPLevelCount, sampleCount); } std::unique_ptr CDevice::CreateTexture2D( const char* name, const uint32_t usage, const Format format, const uint32_t width, const uint32_t height, const Sampler::Desc& defaultSamplerDesc, const uint32_t MIPLevelCount, const uint32_t sampleCount) { return CreateTexture(name, CTexture::Type::TEXTURE_2D, usage, format, width, height, defaultSamplerDesc, MIPLevelCount, sampleCount); } std::unique_ptr CDevice::CreateFramebuffer( const char* name, SColorAttachment* colorAttachment, SDepthStencilAttachment* depthStencilAttachment) { return CFramebuffer::Create( this, name, colorAttachment, depthStencilAttachment); } std::unique_ptr CDevice::CreateBuffer( const char* name, const 
IBuffer::Type type, const uint32_t size, const bool dynamic) { return CBuffer::Create(this, name, type, size, dynamic); } std::unique_ptr CDevice::CreateShaderProgram( const CStr& name, const CShaderDefines& defines) { return CShaderProgram::Create(this, name, defines); } bool CDevice::AcquireNextBackbuffer() { ENSURE(!m_BackbufferAcquired); m_BackbufferAcquired = true; return true; } size_t CDevice::BackbufferKeyHash::operator()(const BackbufferKey& key) const { size_t seed = 0; hash_combine(seed, std::get<0>(key)); hash_combine(seed, std::get<1>(key)); hash_combine(seed, std::get<2>(key)); hash_combine(seed, std::get<3>(key)); return seed; } IFramebuffer* CDevice::GetCurrentBackbuffer( const AttachmentLoadOp colorAttachmentLoadOp, const AttachmentStoreOp colorAttachmentStoreOp, const AttachmentLoadOp depthStencilAttachmentLoadOp, const AttachmentStoreOp depthStencilAttachmentStoreOp) { const BackbufferKey key{ colorAttachmentLoadOp, colorAttachmentStoreOp, depthStencilAttachmentLoadOp, depthStencilAttachmentStoreOp}; auto it = m_Backbuffers.find(key); if (it == m_Backbuffers.end()) { it = m_Backbuffers.emplace(key, CFramebuffer::CreateBackbuffer( this, m_SurfaceDrawableWidth, m_SurfaceDrawableHeight, colorAttachmentLoadOp, colorAttachmentStoreOp, depthStencilAttachmentLoadOp, depthStencilAttachmentStoreOp)).first; } return it->second.get(); } void CDevice::Present() { ENSURE(m_BackbufferAcquired); m_BackbufferAcquired = false; if (m_Window) { PROFILE3("swap buffers"); SDL_GL_SwapWindow(m_Window); ogl_WarnIfError(); } bool checkGLErrorAfterSwap = false; CFG_GET_VAL("gl.checkerrorafterswap", checkGLErrorAfterSwap); #if defined(NDEBUG) if (!checkGLErrorAfterSwap) return; #endif PROFILE3("error check"); // We have to check GL errors after SwapBuffer to avoid possible // synchronizations during rendering. 
if (GLenum err = glGetError()) ONCE(LOGERROR("GL error %s (0x%04x) occurred", ogl_GetErrorName(err), err)); } void CDevice::OnWindowResize(const uint32_t width, const uint32_t height) { ENSURE(!m_BackbufferAcquired); m_Backbuffers.clear(); m_SurfaceDrawableWidth = width; m_SurfaceDrawableHeight = height; } bool CDevice::IsTextureFormatSupported(const Format format) const { bool supported = false; switch (format) { case Format::UNDEFINED: break; case Format::R8G8B8_UNORM: FALLTHROUGH; case Format::R8G8B8A8_UNORM: FALLTHROUGH; case Format::A8_UNORM: FALLTHROUGH; case Format::L8_UNORM: supported = true; break; case Format::R32_SFLOAT: FALLTHROUGH; case Format::R32G32_SFLOAT: FALLTHROUGH; case Format::R32G32B32_SFLOAT: FALLTHROUGH; case Format::R32G32B32A32_SFLOAT: break; case Format::D16_UNORM: FALLTHROUGH; case Format::D24_UNORM: FALLTHROUGH; case Format::D32_SFLOAT: supported = true; break; case Format::D24_UNORM_S8_UINT: #if !CONFIG2_GLES supported = true; #endif break; case Format::D32_SFLOAT_S8_UINT: break; case Format::BC1_RGB_UNORM: FALLTHROUGH; case Format::BC1_RGBA_UNORM: FALLTHROUGH; case Format::BC2_UNORM: FALLTHROUGH; case Format::BC3_UNORM: supported = m_Capabilities.S3TC; break; default: break; } return supported; } bool CDevice::IsFramebufferFormatSupported(const Format format) const { bool supported = false; switch (format) { case Format::UNDEFINED: break; #if !CONFIG2_GLES case Format::R8_UNORM: supported = ogl_HaveVersion(3, 0); break; #endif case Format::R8G8B8A8_UNORM: supported = true; break; default: break; } return supported; } Format CDevice::GetPreferredDepthStencilFormat( const uint32_t UNUSED(usage), const bool depth, const bool stencil) const { ENSURE(depth || stencil); if (stencil) #if CONFIG2_GLES return Format::UNDEFINED; #else return Format::D24_UNORM_S8_UINT; #endif else return Format::D24_UNORM; } std::unique_ptr CreateDevice(SDL_Window* window, const bool arb) { return GL::CDevice::Create(window, arb); } } // namespace GL } // 
namespace Backend } // namespace Renderer Index: ps/trunk/source/renderer/backend/gl/DeviceCommandContext.h =================================================================== --- ps/trunk/source/renderer/backend/gl/DeviceCommandContext.h (revision 28009) +++ ps/trunk/source/renderer/backend/gl/DeviceCommandContext.h (revision 28010) @@ -1,240 +1,254 @@ -/* Copyright (C) 2023 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . 
*/ #ifndef INCLUDED_RENDERER_BACKEND_GL_DEVICECOMMANDCONTEXT #define INCLUDED_RENDERER_BACKEND_GL_DEVICECOMMANDCONTEXT #include "lib/ogl.h" #include "ps/containers/Span.h" #include "renderer/backend/Format.h" #include "renderer/backend/gl/Buffer.h" #include "renderer/backend/IDeviceCommandContext.h" #include "renderer/backend/PipelineState.h" #include #include #include #include #include #include namespace Renderer { namespace Backend { namespace GL { class CDevice; class CFramebuffer; class CShaderProgram; class CTexture; class CDeviceCommandContext final : public IDeviceCommandContext { public: ~CDeviceCommandContext(); IDevice* GetDevice() override; void SetGraphicsPipelineState(const SGraphicsPipelineStateDesc& pipelineState); void SetGraphicsPipelineState(IGraphicsPipelineState* pipelineState) override; + void SetComputePipelineState(IComputePipelineState* pipelineState) override; void BlitFramebuffer( IFramebuffer* sourceFramebuffer, IFramebuffer* destinationFramebuffer, const Rect& sourceRegion, const Rect& destinationRegion, const Sampler::Filter filter) override; void ResolveFramebuffer( IFramebuffer* sourceFramebuffer, IFramebuffer* destinationFramebuffer) override; void BeginFramebufferPass(IFramebuffer* framebuffer) override; void EndFramebufferPass() override; void ClearFramebuffer(const bool color, const bool depth, const bool stencil) override; void ReadbackFramebufferSync( const uint32_t x, const uint32_t y, const uint32_t width, const uint32_t height, void* data) override; void UploadTexture(ITexture* texture, const Format dataFormat, const void* data, const size_t dataSize, const uint32_t level = 0, const uint32_t layer = 0) override; void UploadTextureRegion(ITexture* texture, const Format dataFormat, const void* data, const size_t dataSize, const uint32_t xOffset, const uint32_t yOffset, const uint32_t width, const uint32_t height, const uint32_t level = 0, const uint32_t layer = 0) override; using UploadBufferFunction = std::function; void 
UploadBuffer(IBuffer* buffer, const void* data, const uint32_t dataSize) override; void UploadBuffer(IBuffer* buffer, const UploadBufferFunction& uploadFunction) override; void UploadBufferRegion( IBuffer* buffer, const void* data, const uint32_t dataOffset, const uint32_t dataSize) override; void UploadBufferRegion( IBuffer* buffer, const uint32_t dataOffset, const uint32_t dataSize, const UploadBufferFunction& uploadFunction) override; void SetScissors(const uint32_t scissorCount, const Rect* scissors) override; void SetViewports(const uint32_t viewportCount, const Rect* viewports) override; void SetVertexInputLayout( IVertexInputLayout* vertexInputLayout) override; void SetVertexBuffer( const uint32_t bindingSlot, IBuffer* buffer, const uint32_t offset) override; void SetVertexBufferData( const uint32_t bindingSlot, const void* data, const uint32_t dataSize) override; void SetIndexBuffer(IBuffer* buffer) override; void SetIndexBufferData(const void* data, const uint32_t dataSize) override; void BeginPass() override; void EndPass() override; void Draw(const uint32_t firstVertex, const uint32_t vertexCount) override; void DrawIndexed( const uint32_t firstIndex, const uint32_t indexCount, const int32_t vertexOffset) override; void DrawInstanced( const uint32_t firstVertex, const uint32_t vertexCount, const uint32_t firstInstance, const uint32_t instanceCount) override; void DrawIndexedInstanced( const uint32_t firstIndex, const uint32_t indexCount, const uint32_t firstInstance, const uint32_t instanceCount, const int32_t vertexOffset) override; void DrawIndexedInRange( const uint32_t firstIndex, const uint32_t indexCount, const uint32_t start, const uint32_t end) override; + void BeginComputePass() override; + void EndComputePass() override; + + void Dispatch( + const uint32_t groupCountX, + const uint32_t groupCountY, + const uint32_t groupCountZ) override; + void SetTexture(const int32_t bindingSlot, ITexture* texture) override; + void SetStorageTexture(const 
int32_t bindingSlot, ITexture* texture) override; + void SetUniform( const int32_t bindingSlot, const float value) override; void SetUniform( const int32_t bindingSlot, const float valueX, const float valueY) override; void SetUniform( const int32_t bindingSlot, const float valueX, const float valueY, const float valueZ) override; void SetUniform( const int32_t bindingSlot, const float valueX, const float valueY, const float valueZ, const float valueW) override; void SetUniform( const int32_t bindingSlot, PS::span values) override; void BeginScopedLabel(const char* name) override; void EndScopedLabel() override; void Flush() override; // We need to know when to invalidate our texture bind cache. void OnTextureDestroy(CTexture* texture); private: friend class CDevice; friend class CTexture; static std::unique_ptr Create(CDevice* device); CDeviceCommandContext(CDevice* device); void ResetStates(); void SetGraphicsPipelineStateImpl( const SGraphicsPipelineStateDesc& pipelineStateDesc, const bool force); void BindTexture(const uint32_t unit, const GLenum target, const GLuint handle); void BindBuffer(const IBuffer::Type type, CBuffer* buffer); CDevice* m_Device = nullptr; SGraphicsPipelineStateDesc m_GraphicsPipelineStateDesc{}; CFramebuffer* m_Framebuffer = nullptr; CShaderProgram* m_ShaderProgram = nullptr; uint32_t m_ScissorCount = 0; // GL2.1 doesn't support more than 1 scissor. 
std::array m_Scissors; + SComputePipelineStateDesc m_ComputePipelineStateDesc{}; + uint32_t m_ScopedLabelDepth = 0; CBuffer* m_VertexBuffer = nullptr; CBuffer* m_IndexBuffer = nullptr; const void* m_IndexBufferData = nullptr; bool m_InsideFramebufferPass = false; bool m_InsidePass = false; + bool m_InsideComputePass = false; uint32_t m_ActiveTextureUnit = 0; struct BindUnit { GLenum target; GLuint handle; }; std::array m_BoundTextures; class ScopedBind { public: ScopedBind(CDeviceCommandContext* deviceCommandContext, const GLenum target, const GLuint handle); ~ScopedBind(); private: CDeviceCommandContext* m_DeviceCommandContext = nullptr; BindUnit m_OldBindUnit; uint32_t m_ActiveTextureUnit = 0; }; using BoundBuffer = std::pair; std::array m_BoundBuffers; class ScopedBufferBind { public: ScopedBufferBind( CDeviceCommandContext* deviceCommandContext, CBuffer* buffer); ~ScopedBufferBind(); private: CDeviceCommandContext* m_DeviceCommandContext = nullptr; size_t m_CacheIndex = 0; }; struct VertexAttributeFormat { Format format; uint32_t offset; uint32_t stride; VertexAttributeRate rate; uint32_t bindingSlot; bool active; bool initialized; }; std::array< VertexAttributeFormat, static_cast(VertexAttributeStream::UV7) + 1> m_VertexAttributeFormat; }; } // namespace GL } // namespace Backend } // namespace Renderer #endif // INCLUDED_RENDERER_BACKEND_GL_DEVICECOMMANDCONTEXT Index: ps/trunk/source/renderer/backend/gl/PipelineState.h =================================================================== --- ps/trunk/source/renderer/backend/gl/PipelineState.h (revision 28009) +++ ps/trunk/source/renderer/backend/gl/PipelineState.h (revision 28010) @@ -1,68 +1,92 @@ -/* Copyright (C) 2022 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. 
is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . */ #ifndef INCLUDED_RENDERER_BACKEND_GL_PIPELINESTATE #define INCLUDED_RENDERER_BACKEND_GL_PIPELINESTATE #include "lib/ogl.h" #include "renderer/backend/PipelineState.h" #include #include namespace Renderer { namespace Backend { namespace GL { class CDevice; class CGraphicsPipelineState final : public IGraphicsPipelineState { public: ~CGraphicsPipelineState() override = default; IDevice* GetDevice() override; IShaderProgram* GetShaderProgram() const override { return m_Desc.shaderProgram; } const SGraphicsPipelineStateDesc& GetDesc() const { return m_Desc; } private: friend class CDevice; static std::unique_ptr Create( CDevice* device, const SGraphicsPipelineStateDesc& desc); CGraphicsPipelineState() = default; CDevice* m_Device = nullptr; SGraphicsPipelineStateDesc m_Desc{}; }; +class CComputePipelineState final : public IComputePipelineState +{ +public: + ~CComputePipelineState() override = default; + + IDevice* GetDevice() override; + + IShaderProgram* GetShaderProgram() const override { return m_Desc.shaderProgram; } + + const SComputePipelineStateDesc& GetDesc() const { return m_Desc; } + +private: + friend class CDevice; + + static std::unique_ptr Create( + CDevice* device, const SComputePipelineStateDesc& desc); + + CComputePipelineState() = default; + + CDevice* m_Device = nullptr; + + SComputePipelineStateDesc m_Desc{}; +}; + } // namespace GL } // namespace Backend } // namespace Renderer 
#endif // INCLUDED_RENDERER_BACKEND_GL_PIPELINESTATE Index: ps/trunk/source/renderer/backend/vulkan/Device.h =================================================================== --- ps/trunk/source/renderer/backend/vulkan/Device.h (revision 28009) +++ ps/trunk/source/renderer/backend/vulkan/Device.h (revision 28010) @@ -1,231 +1,234 @@ -/* Copyright (C) 2023 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . */ #ifndef INCLUDED_RENDERER_BACKEND_VULKAN_DEVICE #define INCLUDED_RENDERER_BACKEND_VULKAN_DEVICE #include "renderer/backend/IDevice.h" #include "renderer/backend/vulkan/DeviceForward.h" #include "renderer/backend/vulkan/DeviceSelection.h" #include "renderer/backend/vulkan/Texture.h" #include "renderer/backend/vulkan/VMA.h" #include "scriptinterface/ScriptForward.h" #include #include #include #include #include #include #include #include typedef struct SDL_Window SDL_Window; namespace Renderer { namespace Backend { namespace Vulkan { static constexpr size_t NUMBER_OF_FRAMES_IN_FLIGHT = 3; class CBuffer; class CDescriptorManager; class CFramebuffer; class CRenderPassManager; class CRingCommandContext; class CSamplerManager; class CSubmitScheduler; class CSwapChain; class CDevice final : public IDevice { public: /** * Creates the Vulkan device. 
*/ static std::unique_ptr Create(SDL_Window* window); ~CDevice() override; Backend GetBackend() const override { return Backend::VULKAN; } const std::string& GetName() const override { return m_Name; } const std::string& GetVersion() const override { return m_Version; } const std::string& GetDriverInformation() const override { return m_DriverInformation; } const std::vector& GetExtensions() const override { return m_Extensions; } void Report(const ScriptRequest& rq, JS::HandleValue settings) override; std::unique_ptr CreateCommandContext() override; std::unique_ptr CreateGraphicsPipelineState( const SGraphicsPipelineStateDesc& pipelineStateDesc) override; + std::unique_ptr CreateComputePipelineState( + const SComputePipelineStateDesc& pipelineStateDesc) override; + std::unique_ptr CreateVertexInputLayout( const PS::span attributes) override; std::unique_ptr CreateTexture( const char* name, const ITexture::Type type, const uint32_t usage, const Format format, const uint32_t width, const uint32_t height, const Sampler::Desc& defaultSamplerDesc, const uint32_t MIPLevelCount, const uint32_t sampleCount) override; std::unique_ptr CreateTexture2D( const char* name, const uint32_t usage, const Format format, const uint32_t width, const uint32_t height, const Sampler::Desc& defaultSamplerDesc, const uint32_t MIPLevelCount = 1, const uint32_t sampleCount = 1) override; std::unique_ptr CreateFramebuffer( const char* name, SColorAttachment* colorAttachment, SDepthStencilAttachment* depthStencilAttachment) override; std::unique_ptr CreateBuffer( const char* name, const IBuffer::Type type, const uint32_t size, const bool dynamic) override; std::unique_ptr CreateCBuffer( const char* name, const IBuffer::Type type, const uint32_t size, const bool dynamic); std::unique_ptr CreateShaderProgram( const CStr& name, const CShaderDefines& defines) override; bool AcquireNextBackbuffer() override; IFramebuffer* GetCurrentBackbuffer( const AttachmentLoadOp colorAttachmentLoadOp, const 
AttachmentStoreOp colorAttachmentStoreOp, const AttachmentLoadOp depthStencilAttachmentLoadOp, const AttachmentStoreOp depthStencilAttachmentStoreOp) override; void Present() override; void OnWindowResize(const uint32_t width, const uint32_t height) override; bool IsTextureFormatSupported(const Format format) const override; bool IsFramebufferFormatSupported(const Format format) const override; Format GetPreferredDepthStencilFormat( const uint32_t usage, const bool depth, const bool stencil) const override; const Capabilities& GetCapabilities() const override { return m_Capabilities; } VkDevice GetVkDevice() { return m_Device; } VmaAllocator GetVMAAllocator() { return m_VMAAllocator; } void ScheduleObjectToDestroy( VkObjectType type, const void* handle, const VmaAllocation allocation) { ScheduleObjectToDestroy(type, reinterpret_cast(handle), allocation); } void ScheduleObjectToDestroy( VkObjectType type, const uint64_t handle, const VmaAllocation allocation); void ScheduleTextureToDestroy(const DeviceObjectUID uid); void SetObjectName(VkObjectType type, const void* handle, const char* name) { SetObjectName(type, reinterpret_cast(handle), name); } void SetObjectName(VkObjectType type, const uint64_t handle, const char* name); std::unique_ptr CreateRingCommandContext(const size_t size); const SAvailablePhysicalDevice& GetChoosenPhysicalDevice() const { return m_ChoosenDevice; } CRenderPassManager& GetRenderPassManager() { return *m_RenderPassManager; } CSamplerManager& GetSamplerManager() { return *m_SamplerManager; } CDescriptorManager& GetDescriptorManager() { return *m_DescriptorManager; } CTexture* GetCurrentBackbufferTexture(); CTexture* GetOrCreateBackbufferReadbackTexture(); DeviceObjectUID GenerateNextDeviceObjectUID(); private: CDevice(); void RecreateSwapChain(); bool IsSwapChainValid(); void ProcessObjectToDestroyQueue(const bool ignoreFrameID = false); void ProcessTextureToDestroyQueue(const bool ignoreFrameID = false); bool 
IsFormatSupportedForUsage(const Format format, const uint32_t usage) const; std::string m_Name; std::string m_Version; std::string m_VendorID; std::string m_DriverInformation; std::vector m_Extensions; std::vector m_InstanceExtensions; std::vector m_ValidationLayers; SAvailablePhysicalDevice m_ChoosenDevice{}; std::vector m_AvailablePhysicalDevices; Capabilities m_Capabilities{}; VkInstance m_Instance = VK_NULL_HANDLE; VkDebugUtilsMessengerEXT m_DebugMessenger = VK_NULL_HANDLE; SDL_Window* m_Window = nullptr; VkSurfaceKHR m_Surface = VK_NULL_HANDLE; VkDevice m_Device = VK_NULL_HANDLE; VmaAllocator m_VMAAllocator = VK_NULL_HANDLE; VkQueue m_GraphicsQueue = VK_NULL_HANDLE; uint32_t m_GraphicsQueueFamilyIndex = std::numeric_limits::max(); std::unique_ptr m_SwapChain; std::unique_ptr m_BackbufferReadbackTexture; uint32_t m_FrameID = 0; struct ObjectToDestroy { uint32_t frameID; VkObjectType type; uint64_t handle; VmaAllocation allocation; }; std::queue m_ObjectToDestroyQueue; std::queue> m_TextureToDestroyQueue; std::unique_ptr m_RenderPassManager; std::unique_ptr m_SamplerManager; std::unique_ptr m_DescriptorManager; std::unique_ptr m_SubmitScheduler; DeviceObjectUID m_LastAvailableUID{1}; }; } // namespace Vulkan } // namespace Backend } // namespace Renderer #endif // INCLUDED_RENDERER_BACKEND_VULKAN_DEVICE Index: ps/trunk/source/renderer/backend/vulkan/PipelineState.h =================================================================== --- ps/trunk/source/renderer/backend/vulkan/PipelineState.h (revision 28009) +++ ps/trunk/source/renderer/backend/vulkan/PipelineState.h (revision 28010) @@ -1,96 +1,128 @@ -/* Copyright (C) 2023 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. 
is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . */ #ifndef INCLUDED_RENDERER_BACKEND_VULKAN_PIPELINESTATE #define INCLUDED_RENDERER_BACKEND_VULKAN_PIPELINESTATE #include "renderer/backend/PipelineState.h" #include "renderer/backend/vulkan/Framebuffer.h" #include "renderer/backend/vulkan/ShaderProgram.h" #include "renderer/backend/vulkan/DeviceObjectUID.h" #include #include #include #include namespace Renderer { namespace Backend { namespace Vulkan { class CDevice; class CFramebuffer; class CGraphicsPipelineState final : public IGraphicsPipelineState { public: ~CGraphicsPipelineState() override; IDevice* GetDevice() override; IShaderProgram* GetShaderProgram() const override { return m_Desc.shaderProgram; } const SGraphicsPipelineStateDesc& GetDesc() const { return m_Desc; } VkPipeline GetOrCreatePipeline( const CVertexInputLayout* vertexInputLayout, CFramebuffer* framebuffer); DeviceObjectUID GetUID() const { return m_UID; } private: friend class CDevice; static std::unique_ptr Create( CDevice* device, const SGraphicsPipelineStateDesc& desc); CGraphicsPipelineState() = default; CDevice* m_Device = nullptr; DeviceObjectUID m_UID{INVALID_DEVICE_OBJECT_UID}; SGraphicsPipelineStateDesc m_Desc{}; struct CacheKey { DeviceObjectUID vertexInputLayoutUID; // TODO: try to replace the UID by the only required parameters. 
DeviceObjectUID framebufferUID; }; struct CacheKeyHash { size_t operator()(const CacheKey& cacheKey) const; }; struct CacheKeyEqual { bool operator()(const CacheKey& lhs, const CacheKey& rhs) const; }; std::unordered_map m_PipelineMap; }; +class CComputePipelineState final : public IComputePipelineState +{ +public: + ~CComputePipelineState() override; + + IDevice* GetDevice() override; + + IShaderProgram* GetShaderProgram() const override { return m_Desc.shaderProgram; } + + const SComputePipelineStateDesc& GetDesc() const { return m_Desc; } + + VkPipeline GetPipeline() { return m_Pipeline; } + + DeviceObjectUID GetUID() const { return m_UID; } + +private: + friend class CDevice; + + static std::unique_ptr Create( + CDevice* device, const SComputePipelineStateDesc& desc); + + CComputePipelineState() = default; + + CDevice* m_Device{nullptr}; + + DeviceObjectUID m_UID{INVALID_DEVICE_OBJECT_UID}; + + SComputePipelineStateDesc m_Desc{}; + + VkPipeline m_Pipeline{VK_NULL_HANDLE}; +}; + } // namespace Vulkan } // namespace Backend } // namespace Renderer #endif // INCLUDED_RENDERER_BACKEND_VULKAN_PIPELINESTATE Index: ps/trunk/source/renderer/backend/gl/PipelineState.cpp =================================================================== --- ps/trunk/source/renderer/backend/gl/PipelineState.cpp (revision 28009) +++ ps/trunk/source/renderer/backend/gl/PipelineState.cpp (revision 28010) @@ -1,52 +1,67 @@ -/* Copyright (C) 2022 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . */ #include "precompiled.h" #include "PipelineState.h" #include "renderer/backend/gl/Device.h" namespace Renderer { namespace Backend { namespace GL { // static std::unique_ptr CGraphicsPipelineState::Create( CDevice* device, const SGraphicsPipelineStateDesc& desc) { std::unique_ptr pipelineState{new CGraphicsPipelineState()}; pipelineState->m_Device = device; pipelineState->m_Desc = desc; return pipelineState; } IDevice* CGraphicsPipelineState::GetDevice() { return m_Device; } +// static +std::unique_ptr CComputePipelineState::Create( + CDevice* device, const SComputePipelineStateDesc& desc) +{ + std::unique_ptr pipelineState{new CComputePipelineState()}; + pipelineState->m_Device = device; + pipelineState->m_Desc = desc; + return pipelineState; +} + +IDevice* CComputePipelineState::GetDevice() +{ + return m_Device; +} + } // namespace GL } // namespace Backend } // namespace Renderer Index: ps/trunk/source/renderer/backend/vulkan/Device.cpp =================================================================== --- ps/trunk/source/renderer/backend/vulkan/Device.cpp (revision 28009) +++ ps/trunk/source/renderer/backend/vulkan/Device.cpp (revision 28010) @@ -1,1047 +1,1053 @@ -/* Copyright (C) 2023 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . */ #include "precompiled.h" #include "Device.h" #include "lib/external_libraries/libsdl.h" #include "lib/hash.h" #include "lib/sysdep/os.h" #include "maths/MathUtil.h" #include "ps/CLogger.h" #include "ps/ConfigDB.h" #include "ps/Profile.h" #include "renderer/backend/vulkan/Buffer.h" #include "renderer/backend/vulkan/DescriptorManager.h" #include "renderer/backend/vulkan/DeviceCommandContext.h" #include "renderer/backend/vulkan/DeviceSelection.h" #include "renderer/backend/vulkan/Framebuffer.h" #include "renderer/backend/vulkan/Mapping.h" #include "renderer/backend/vulkan/PipelineState.h" #include "renderer/backend/vulkan/RenderPassManager.h" #include "renderer/backend/vulkan/RingCommandContext.h" #include "renderer/backend/vulkan/SamplerManager.h" #include "renderer/backend/vulkan/ShaderProgram.h" #include "renderer/backend/vulkan/SubmitScheduler.h" #include "renderer/backend/vulkan/SwapChain.h" #include "renderer/backend/vulkan/Texture.h" #include "renderer/backend/vulkan/Utilities.h" #include "scriptinterface/JSON.h" #include "scriptinterface/Object.h" #include "scriptinterface/ScriptInterface.h" #include "scriptinterface/ScriptRequest.h" #include #include #include #include #include #include #include // According to https://wiki.libsdl.org/SDL_Vulkan_LoadLibrary the following // functionality is supported since SDL 2.0.6. 
#if SDL_VERSION_ATLEAST(2, 0, 6) #include #endif namespace Renderer { namespace Backend { namespace Vulkan { namespace { std::vector GetRequiredSDLExtensions(SDL_Window* window) { if (!window) return {}; const size_t MAX_EXTENSION_COUNT = 16; unsigned int SDLExtensionCount = MAX_EXTENSION_COUNT; const char* SDLExtensions[MAX_EXTENSION_COUNT]; ENSURE(SDL_Vulkan_GetInstanceExtensions(window, &SDLExtensionCount, SDLExtensions)); std::vector requiredExtensions; requiredExtensions.reserve(SDLExtensionCount); std::copy_n(SDLExtensions, SDLExtensionCount, std::back_inserter(requiredExtensions)); return requiredExtensions; } std::vector GetAvailableValidationLayers() { uint32_t layerCount = 0; ENSURE_VK_SUCCESS(vkEnumerateInstanceLayerProperties(&layerCount, nullptr)); std::vector availableLayers(layerCount); ENSURE_VK_SUCCESS(vkEnumerateInstanceLayerProperties(&layerCount, availableLayers.data())); for (const VkLayerProperties& layer : availableLayers) { LOGMESSAGE("Vulkan validation layer: '%s' (%s) v%u.%u.%u.%u", layer.layerName, layer.description, VK_API_VERSION_VARIANT(layer.specVersion), VK_API_VERSION_MAJOR(layer.specVersion), VK_API_VERSION_MINOR(layer.specVersion), VK_API_VERSION_PATCH(layer.specVersion)); } std::vector availableValidationLayers; availableValidationLayers.reserve(layerCount); for (const VkLayerProperties& layer : availableLayers) availableValidationLayers.emplace_back(layer.layerName); return availableValidationLayers; } std::vector GetAvailableInstanceExtensions(const char* layerName = nullptr) { uint32_t extensionCount = 0; ENSURE_VK_SUCCESS(vkEnumerateInstanceExtensionProperties(layerName, &extensionCount, nullptr)); std::vector extensions(extensionCount); ENSURE_VK_SUCCESS(vkEnumerateInstanceExtensionProperties(layerName, &extensionCount, extensions.data())); std::vector availableExtensions; for (const VkExtensionProperties& extension : extensions) availableExtensions.emplace_back(extension.extensionName); return availableExtensions; } 
VKAPI_ATTR VkBool32 VKAPI_CALL DebugCallback( VkDebugUtilsMessageSeverityFlagBitsEXT messageSeverity, VkDebugUtilsMessageTypeFlagsEXT messageType, const VkDebugUtilsMessengerCallbackDataEXT* callbackData, void* UNUSED(userData)) { if ((messageSeverity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT) || (messageSeverity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT)) LOGMESSAGE("Vulkan: %s", callbackData->pMessage); else if (messageSeverity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT) { struct HideRule { VkDebugUtilsMessageTypeFlagsEXT flags; std::string_view pattern; bool skip; }; constexpr HideRule hideRules[] = { // Not consumed shader output is a known problem which produces too // many warning. {VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT, "OutputNotConsumed", false}, // TODO: check vkGetImageMemoryRequirements2 for prefersDedicatedAllocation. {VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT, "vkBindMemory-small-dedicated-allocation", false}, {VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT, "vkAllocateMemory-small-allocation", false}, // We have some unnecessary clears which were needed for GL. // Ignore message for now, because they're spawned each frame. {VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT, "ClearCmdBeforeDraw", true}, {VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT, "vkCmdClearAttachments-clear-after-load", true}, // TODO: investigate probably false-positive report. 
{VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT, "vkCmdBeginRenderPass-StoreOpDontCareThenLoadOpLoad", true}, }; const auto it = std::find_if(std::begin(hideRules), std::end(hideRules), [messageType, message = std::string_view{callbackData->pMessage}](const HideRule& hideRule) -> bool { return (hideRule.flags & messageType) && message.find(hideRule.pattern) != std::string_view::npos; }); if (it == std::end(hideRules)) LOGWARNING("Vulkan: %s", callbackData->pMessage); else if (!it->skip) LOGMESSAGE("Vulkan: %s", callbackData->pMessage); } else if (messageSeverity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT) LOGERROR("Vulkan: %s", callbackData->pMessage); return VK_FALSE; } // A workaround function to meet calling conventions of Vulkan, SDL and GLAD. GLADapiproc GetInstanceProcAddr(VkInstance instance, const char* name) { #if SDL_VERSION_ATLEAST(2, 0, 6) PFN_vkGetInstanceProcAddr function = reinterpret_cast(SDL_Vulkan_GetVkGetInstanceProcAddr()); return reinterpret_cast(function(instance, name)); #else return nullptr; #endif } } // anonymous namespace // static std::unique_ptr CDevice::Create(SDL_Window* window) { if (!window) { LOGERROR("Can't create Vulkan device without window."); return nullptr; } GLADuserptrloadfunc gladLoadFunction = reinterpret_cast(GetInstanceProcAddr); std::unique_ptr device(new CDevice()); device->m_Window = window; #ifdef NDEBUG bool enableDebugMessages = false; CFG_GET_VAL("renderer.backend.debugmessages", enableDebugMessages); bool enableDebugLabels = false; CFG_GET_VAL("renderer.backend.debuglabels", enableDebugLabels); bool enableDebugScopedLabels = false; CFG_GET_VAL("renderer.backend.debugscopedlabels", enableDebugScopedLabels); #else bool enableDebugMessages = true; bool enableDebugLabels = true; bool enableDebugScopedLabels = true; #endif int gladVulkanVersion = gladLoadVulkanUserPtr(nullptr, gladLoadFunction, nullptr); if (!gladVulkanVersion) { LOGERROR("GLAD unable to load vulkan."); return nullptr; } VkApplicationInfo 
applicationInfo{}; applicationInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO; applicationInfo.pApplicationName = "0 A.D."; applicationInfo.applicationVersion = VK_MAKE_VERSION(0, 0, 27); applicationInfo.pEngineName = "Pyrogenesis"; applicationInfo.engineVersion = applicationInfo.applicationVersion; applicationInfo.apiVersion = VK_API_VERSION_1_1; std::vector requiredInstanceExtensions = GetRequiredSDLExtensions(window); device->m_ValidationLayers = GetAvailableValidationLayers(); auto hasValidationLayer = [&layers = device->m_ValidationLayers](const char* name) -> bool { return std::find(layers.begin(), layers.end(), name) != layers.end(); }; device->m_InstanceExtensions = GetAvailableInstanceExtensions(); auto hasInstanceExtension = [&extensions = device->m_InstanceExtensions](const char* name) -> bool { return std::find(extensions.begin(), extensions.end(), name) != extensions.end(); }; #ifdef NDEBUG bool enableDebugContext = false; CFG_GET_VAL("renderer.backend.debugcontext", enableDebugContext); #else bool enableDebugContext = true; #endif if (!hasInstanceExtension(VK_EXT_DEBUG_UTILS_EXTENSION_NAME)) enableDebugMessages = enableDebugLabels = enableDebugScopedLabels = false; const bool enableDebugLayers = enableDebugContext || enableDebugMessages || enableDebugLabels || enableDebugScopedLabels; if (enableDebugLayers) requiredInstanceExtensions.emplace_back(VK_EXT_DEBUG_UTILS_EXTENSION_NAME); std::vector requestedValidationLayers; const bool enableValidationFeatures = enableDebugMessages && hasValidationLayer("VK_LAYER_KHRONOS_validation"); if (enableValidationFeatures) requestedValidationLayers.emplace_back("VK_LAYER_KHRONOS_validation"); // https://github.com/KhronosGroup/Vulkan-ValidationLayers/blob/master/docs/synchronization_usage.md VkValidationFeatureEnableEXT validationFeatureEnables[] = { VK_VALIDATION_FEATURE_ENABLE_BEST_PRACTICES_EXT, VK_VALIDATION_FEATURE_ENABLE_SYNCHRONIZATION_VALIDATION_EXT }; VkValidationFeaturesEXT validationFeatures{}; 
validationFeatures.sType = VK_STRUCTURE_TYPE_VALIDATION_FEATURES_EXT; validationFeatures.enabledValidationFeatureCount = std::size(validationFeatureEnables); validationFeatures.pEnabledValidationFeatures = validationFeatureEnables; VkInstanceCreateInfo instanceCreateInfo{}; instanceCreateInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO; instanceCreateInfo.pApplicationInfo = &applicationInfo; instanceCreateInfo.enabledExtensionCount = requiredInstanceExtensions.size(); instanceCreateInfo.ppEnabledExtensionNames = requiredInstanceExtensions.data(); if (requestedValidationLayers.empty()) { instanceCreateInfo.enabledLayerCount = 0; instanceCreateInfo.ppEnabledLayerNames = nullptr; } else { instanceCreateInfo.enabledLayerCount = requestedValidationLayers.size(); instanceCreateInfo.ppEnabledLayerNames = requestedValidationLayers.data(); } // Enabling validation features might significantly reduce performance, // even more than the standard validation layer. if (enableValidationFeatures && enableDebugContext) { instanceCreateInfo.pNext = &validationFeatures; } const VkResult createInstanceResult = vkCreateInstance(&instanceCreateInfo, nullptr, &device->m_Instance); if (createInstanceResult != VK_SUCCESS) { if (createInstanceResult == VK_ERROR_INCOMPATIBLE_DRIVER) LOGERROR("Can't create Vulkan instance: incompatible driver."); else if (createInstanceResult == VK_ERROR_EXTENSION_NOT_PRESENT) LOGERROR("Can't create Vulkan instance: extension not present."); else if (createInstanceResult == VK_ERROR_LAYER_NOT_PRESENT) LOGERROR("Can't create Vulkan instance: layer not present."); else LOGERROR("Unknown error during Vulkan instance creation: %d (%s)", static_cast(createInstanceResult), Utilities::GetVkResultName(createInstanceResult)); return nullptr; } gladVulkanVersion = gladLoadVulkanUserPtr(nullptr, gladLoadFunction, device->m_Instance); if (!gladVulkanVersion) { LOGERROR("GLAD unable to re-load vulkan after its instance creation."); return nullptr; } if 
(GLAD_VK_EXT_debug_utils && enableDebugMessages) { VkDebugUtilsMessengerCreateInfoEXT debugCreateInfo{}; debugCreateInfo.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT; debugCreateInfo.messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT; debugCreateInfo.messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT; debugCreateInfo.pfnUserCallback = DebugCallback; debugCreateInfo.pUserData = nullptr; ENSURE_VK_SUCCESS(vkCreateDebugUtilsMessengerEXT( device->m_Instance, &debugCreateInfo, nullptr, &device->m_DebugMessenger)); } if (window) ENSURE(SDL_Vulkan_CreateSurface(window, device->m_Instance, &device->m_Surface)); const std::vector requiredDeviceExtensions = { VK_KHR_SWAPCHAIN_EXTENSION_NAME }; std::vector availablePhyscialDevices = GetAvailablePhysicalDevices(device->m_Instance, device->m_Surface, requiredDeviceExtensions); for (const SAvailablePhysicalDevice& device : availablePhyscialDevices) { LOGMESSAGE("Vulkan available device: '%s' Type: %u Supported: %c", device.properties.deviceName, static_cast(device.properties.deviceType), IsPhysicalDeviceUnsupported(device) ? 'N' : 'Y'); LOGMESSAGE(" ID: %u VendorID: %u API Version: %u Driver Version: %u", device.properties.deviceID, device.properties.vendorID, device.properties.apiVersion, device.properties.driverVersion); LOGMESSAGE(" hasRequiredExtensions: %c hasOutputToSurfaceSupport: %c", device.hasRequiredExtensions ? 'Y' : 'N', device.hasOutputToSurfaceSupport ? 
'Y' : 'N'); LOGMESSAGE(" graphicsQueueFamilyIndex: %u presentQueueFamilyIndex: %u families: %zu", device.graphicsQueueFamilyIndex, device.presentQueueFamilyIndex, device.queueFamilies.size()); LOGMESSAGE(" maxBoundDescriptorSets: %u", device.properties.limits.maxBoundDescriptorSets); for (const VkSurfaceFormatKHR& surfaceFormat : device.surfaceFormats) { LOGMESSAGE(" Surface format: %u colorSpace: %u Supported: %c", static_cast(surfaceFormat.format), static_cast(surfaceFormat.colorSpace), IsSurfaceFormatSupported(surfaceFormat) ? 'Y' : 'N'); } for (uint32_t memoryTypeIndex = 0; memoryTypeIndex < device.memoryProperties.memoryTypeCount; ++memoryTypeIndex) { const VkMemoryType& type = device.memoryProperties.memoryTypes[memoryTypeIndex]; LOGMESSAGE(" Memory Type Index: %u Flags: %u Heap Index: %u", memoryTypeIndex, static_cast(type.propertyFlags), type.heapIndex); } for (uint32_t memoryHeapIndex = 0; memoryHeapIndex < device.memoryProperties.memoryHeapCount; ++memoryHeapIndex) { const VkMemoryHeap& heap = device.memoryProperties.memoryHeaps[memoryHeapIndex]; LOGMESSAGE(" Memory Heap Index: %u Size: %zu Flags: %u", memoryHeapIndex, static_cast(heap.size / 1024), static_cast(heap.flags)); } } device->m_AvailablePhysicalDevices = availablePhyscialDevices; // We need to remove unsupported devices first. 
availablePhyscialDevices.erase( std::remove_if( availablePhyscialDevices.begin(), availablePhyscialDevices.end(), IsPhysicalDeviceUnsupported), availablePhyscialDevices.end()); if (availablePhyscialDevices.empty()) { LOGERROR("Vulkan can not find any supported and suitable device."); return nullptr; } int deviceIndexOverride = -1; CFG_GET_VAL("renderer.backend.vulkan.deviceindexoverride", deviceIndexOverride); auto choosedDeviceIt = device->m_AvailablePhysicalDevices.end(); if (deviceIndexOverride >= 0) { choosedDeviceIt = std::find_if( device->m_AvailablePhysicalDevices.begin(), device->m_AvailablePhysicalDevices.end(), [deviceIndexOverride](const SAvailablePhysicalDevice& availableDevice) { return availableDevice.index == static_cast(deviceIndexOverride); }); if (choosedDeviceIt == device->m_AvailablePhysicalDevices.end()) LOGWARNING("Device with override index %d not found.", deviceIndexOverride); } if (choosedDeviceIt == device->m_AvailablePhysicalDevices.end()) { // We need to choose the best available device fits our needs. 
choosedDeviceIt = min_element( availablePhyscialDevices.begin(), availablePhyscialDevices.end(), ComparePhysicalDevices); } device->m_ChoosenDevice = *choosedDeviceIt; const SAvailablePhysicalDevice& choosenDevice = device->m_ChoosenDevice; device->m_AvailablePhysicalDevices.erase(std::remove_if( device->m_AvailablePhysicalDevices.begin(), device->m_AvailablePhysicalDevices.end(), [physicalDevice = choosenDevice.device](const SAvailablePhysicalDevice& device) { return physicalDevice == device.device; }), device->m_AvailablePhysicalDevices.end()); gladVulkanVersion = gladLoadVulkanUserPtr(choosenDevice.device, gladLoadFunction, device->m_Instance); if (!gladVulkanVersion) { LOGERROR("GLAD unable to re-load vulkan after choosing its physical device."); return nullptr; } #if !OS_MACOSX auto hasDeviceExtension = [&extensions = choosenDevice.extensions](const char* name) -> bool { return std::find(extensions.begin(), extensions.end(), name) != extensions.end(); }; const bool hasDescriptorIndexing = hasDeviceExtension(VK_EXT_DESCRIPTOR_INDEXING_EXTENSION_NAME); #else // Metal on macOS doesn't support combined samplers natively. Currently // they break compiling SPIR-V shaders with descriptor indexing into MTL // shaders when using MoltenVK. 
const bool hasDescriptorIndexing = false; #endif const bool hasNeededDescriptorIndexingFeatures = hasDescriptorIndexing && choosenDevice.descriptorIndexingProperties.maxUpdateAfterBindDescriptorsInAllPools >= 65536 && choosenDevice.descriptorIndexingFeatures.shaderSampledImageArrayNonUniformIndexing && choosenDevice.descriptorIndexingFeatures.runtimeDescriptorArray && choosenDevice.descriptorIndexingFeatures.descriptorBindingVariableDescriptorCount && choosenDevice.descriptorIndexingFeatures.descriptorBindingPartiallyBound && choosenDevice.descriptorIndexingFeatures.descriptorBindingUpdateUnusedWhilePending && choosenDevice.descriptorIndexingFeatures.descriptorBindingSampledImageUpdateAfterBind; std::vector deviceExtensions = requiredDeviceExtensions; if (hasDescriptorIndexing) deviceExtensions.emplace_back(VK_EXT_DESCRIPTOR_INDEXING_EXTENSION_NAME); device->m_GraphicsQueueFamilyIndex = choosenDevice.graphicsQueueFamilyIndex; const std::array queueFamilyIndices{{ choosenDevice.graphicsQueueFamilyIndex }}; PS::StaticVector queueCreateInfos; const float queuePriority = 1.0f; std::transform(queueFamilyIndices.begin(), queueFamilyIndices.end(), std::back_inserter(queueCreateInfos), [&queuePriority](const size_t queueFamilyIndex) { VkDeviceQueueCreateInfo queueCreateInfo{}; queueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; queueCreateInfo.pQueuePriorities = &queuePriority; queueCreateInfo.queueCount = 1; queueCreateInfo.queueFamilyIndex = queueFamilyIndex; return queueCreateInfo; }); // https://github.com/KhronosGroup/Vulkan-Guide/blob/master/chapters/enabling_features.adoc VkPhysicalDeviceFeatures deviceFeatures{}; VkPhysicalDeviceFeatures2 deviceFeatures2{}; VkPhysicalDeviceDescriptorIndexingFeaturesEXT descriptorIndexingFeatures{}; deviceFeatures.textureCompressionBC = choosenDevice.features.textureCompressionBC; deviceFeatures.samplerAnisotropy = choosenDevice.features.samplerAnisotropy; deviceFeatures.fillModeNonSolid = 
choosenDevice.features.fillModeNonSolid; descriptorIndexingFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_INDEXING_FEATURES_EXT; descriptorIndexingFeatures.shaderSampledImageArrayNonUniformIndexing = choosenDevice.descriptorIndexingFeatures.shaderSampledImageArrayNonUniformIndexing; descriptorIndexingFeatures.runtimeDescriptorArray = choosenDevice.descriptorIndexingFeatures.runtimeDescriptorArray; descriptorIndexingFeatures.descriptorBindingVariableDescriptorCount = choosenDevice.descriptorIndexingFeatures.descriptorBindingVariableDescriptorCount; descriptorIndexingFeatures.descriptorBindingPartiallyBound = choosenDevice.descriptorIndexingFeatures.descriptorBindingPartiallyBound; descriptorIndexingFeatures.descriptorBindingUpdateUnusedWhilePending = choosenDevice.descriptorIndexingFeatures.descriptorBindingUpdateUnusedWhilePending; descriptorIndexingFeatures.descriptorBindingSampledImageUpdateAfterBind = choosenDevice.descriptorIndexingFeatures.descriptorBindingSampledImageUpdateAfterBind; deviceFeatures2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2; deviceFeatures2.features = deviceFeatures; if (hasNeededDescriptorIndexingFeatures) deviceFeatures2.pNext = &descriptorIndexingFeatures; VkDeviceCreateInfo deviceCreateInfo{}; deviceCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; deviceCreateInfo.queueCreateInfoCount = queueCreateInfos.size(); deviceCreateInfo.pQueueCreateInfos = queueCreateInfos.data(); deviceCreateInfo.enabledExtensionCount = deviceExtensions.size(); deviceCreateInfo.ppEnabledExtensionNames = deviceExtensions.data(); deviceCreateInfo.pEnabledFeatures = nullptr; deviceCreateInfo.pNext = &deviceFeatures2; deviceCreateInfo.enabledLayerCount = 0; deviceCreateInfo.ppEnabledLayerNames = nullptr; const VkResult createDeviceResult = vkCreateDevice( choosenDevice.device, &deviceCreateInfo, nullptr, &device->m_Device); if (createDeviceResult != VK_SUCCESS) { if (createDeviceResult == VK_ERROR_FEATURE_NOT_PRESENT) LOGERROR("Can't 
create Vulkan device: feature not present."); else if (createDeviceResult == VK_ERROR_EXTENSION_NOT_PRESENT) LOGERROR("Can't create Vulkan device: extension not present."); else LOGERROR("Unknown error during Vulkan device creation: %d (%s)", static_cast(createDeviceResult), Utilities::GetVkResultName(createDeviceResult)); return nullptr; } VmaVulkanFunctions vulkanFunctions{}; vulkanFunctions.vkGetInstanceProcAddr = vkGetInstanceProcAddr; vulkanFunctions.vkGetDeviceProcAddr = vkGetDeviceProcAddr; vulkanFunctions.vkGetPhysicalDeviceProperties = vkGetPhysicalDeviceProperties; vulkanFunctions.vkGetPhysicalDeviceMemoryProperties = vkGetPhysicalDeviceMemoryProperties; vulkanFunctions.vkAllocateMemory = vkAllocateMemory; vulkanFunctions.vkFreeMemory = vkFreeMemory; vulkanFunctions.vkMapMemory = vkMapMemory; vulkanFunctions.vkUnmapMemory = vkUnmapMemory; vulkanFunctions.vkFlushMappedMemoryRanges = vkFlushMappedMemoryRanges; vulkanFunctions.vkInvalidateMappedMemoryRanges = vkInvalidateMappedMemoryRanges; vulkanFunctions.vkBindBufferMemory = vkBindBufferMemory; vulkanFunctions.vkBindImageMemory = vkBindImageMemory; vulkanFunctions.vkGetBufferMemoryRequirements = vkGetBufferMemoryRequirements; vulkanFunctions.vkGetImageMemoryRequirements = vkGetImageMemoryRequirements; vulkanFunctions.vkCreateBuffer = vkCreateBuffer; vulkanFunctions.vkDestroyBuffer = vkDestroyBuffer; vulkanFunctions.vkCreateImage = vkCreateImage; vulkanFunctions.vkDestroyImage = vkDestroyImage; vulkanFunctions.vkCmdCopyBuffer = vkCmdCopyBuffer; // Functions promoted to Vulkan 1.1. 
vulkanFunctions.vkGetBufferMemoryRequirements2KHR = vkGetBufferMemoryRequirements2; vulkanFunctions.vkGetImageMemoryRequirements2KHR = vkGetImageMemoryRequirements2; vulkanFunctions.vkBindBufferMemory2KHR = vkBindBufferMemory2; vulkanFunctions.vkBindImageMemory2KHR = vkBindImageMemory2; vulkanFunctions.vkGetPhysicalDeviceMemoryProperties2KHR = vkGetPhysicalDeviceMemoryProperties2; VmaAllocatorCreateInfo allocatorCreateInfo{}; allocatorCreateInfo.instance = device->m_Instance; allocatorCreateInfo.physicalDevice = choosenDevice.device; allocatorCreateInfo.device = device->m_Device; allocatorCreateInfo.vulkanApiVersion = applicationInfo.apiVersion; allocatorCreateInfo.pVulkanFunctions = &vulkanFunctions; const VkResult createVMAAllocatorResult = vmaCreateAllocator(&allocatorCreateInfo, &device->m_VMAAllocator); if (createVMAAllocatorResult != VK_SUCCESS) { LOGERROR("Failed to create VMA allocator: %d (%s)", static_cast(createVMAAllocatorResult), Utilities::GetVkResultName(createVMAAllocatorResult)); return nullptr; } // We need to use VK_SHARING_MODE_CONCURRENT if we have graphics and present // in different queues. 
vkGetDeviceQueue(device->m_Device, choosenDevice.graphicsQueueFamilyIndex, 0, &device->m_GraphicsQueue); ENSURE(device->m_GraphicsQueue != VK_NULL_HANDLE); Capabilities& capabilities = device->m_Capabilities; capabilities.debugLabels = enableDebugLabels; capabilities.debugScopedLabels = enableDebugScopedLabels; capabilities.S3TC = choosenDevice.features.textureCompressionBC; capabilities.ARBShaders = false; capabilities.ARBShadersShadow = false; capabilities.computeShaders = true; capabilities.instancing = true; capabilities.maxSampleCount = 1; const VkSampleCountFlags sampleCountFlags = choosenDevice.properties.limits.framebufferColorSampleCounts & choosenDevice.properties.limits.framebufferDepthSampleCounts & choosenDevice.properties.limits.framebufferStencilSampleCounts; const std::array allowedSampleCountBits = { VK_SAMPLE_COUNT_1_BIT, VK_SAMPLE_COUNT_2_BIT, VK_SAMPLE_COUNT_4_BIT, VK_SAMPLE_COUNT_8_BIT, VK_SAMPLE_COUNT_16_BIT, }; for (size_t index = 0; index < allowedSampleCountBits.size(); ++index) if (sampleCountFlags & allowedSampleCountBits[index]) device->m_Capabilities.maxSampleCount = 1u << index; capabilities.multisampling = device->m_Capabilities.maxSampleCount > 1; capabilities.anisotropicFiltering = choosenDevice.features.samplerAnisotropy; capabilities.maxAnisotropy = choosenDevice.properties.limits.maxSamplerAnisotropy; capabilities.maxTextureSize = choosenDevice.properties.limits.maxImageDimension2D; device->m_RenderPassManager = std::make_unique(device.get()); device->m_SamplerManager = std::make_unique(device.get()); device->m_SubmitScheduler = std::make_unique( device.get(), device->m_GraphicsQueueFamilyIndex, device->m_GraphicsQueue); bool disableDescriptorIndexing = false; CFG_GET_VAL("renderer.backend.vulkan.disabledescriptorindexing", disableDescriptorIndexing); const bool useDescriptorIndexing = hasNeededDescriptorIndexingFeatures && !disableDescriptorIndexing; device->m_DescriptorManager = std::make_unique(device.get(), 
useDescriptorIndexing); device->RecreateSwapChain(); device->m_Name = choosenDevice.properties.deviceName; device->m_Version = std::to_string(VK_API_VERSION_VARIANT(choosenDevice.properties.apiVersion)) + "." + std::to_string(VK_API_VERSION_MAJOR(choosenDevice.properties.apiVersion)) + "." + std::to_string(VK_API_VERSION_MINOR(choosenDevice.properties.apiVersion)) + "." + std::to_string(VK_API_VERSION_PATCH(choosenDevice.properties.apiVersion)); device->m_DriverInformation = std::to_string(choosenDevice.properties.driverVersion); // Refs: // * https://www.khronos.org/registry/vulkan/specs/1.3-extensions/man/html/VkPhysicalDeviceProperties.html // * https://pcisig.com/membership/member-companies device->m_VendorID = std::to_string(choosenDevice.properties.vendorID); device->m_Extensions = choosenDevice.extensions; return device; } CDevice::CDevice() = default; CDevice::~CDevice() { if (m_Device) vkDeviceWaitIdle(m_Device); // The order of destroying does matter to avoid use-after-free and validation // layers complaints. 
m_BackbufferReadbackTexture.reset(); m_SubmitScheduler.reset(); ProcessTextureToDestroyQueue(true); m_RenderPassManager.reset(); m_SamplerManager.reset(); m_DescriptorManager.reset(); m_SwapChain.reset(); ProcessObjectToDestroyQueue(true); if (m_VMAAllocator != VK_NULL_HANDLE) vmaDestroyAllocator(m_VMAAllocator); if (m_Device != VK_NULL_HANDLE) vkDestroyDevice(m_Device, nullptr); if (m_Surface != VK_NULL_HANDLE) vkDestroySurfaceKHR(m_Instance, m_Surface, nullptr); if (GLAD_VK_EXT_debug_utils && m_DebugMessenger) vkDestroyDebugUtilsMessengerEXT(m_Instance, m_DebugMessenger, nullptr); if (m_Instance != VK_NULL_HANDLE) vkDestroyInstance(m_Instance, nullptr); } void CDevice::Report(const ScriptRequest& rq, JS::HandleValue settings) { Script::SetProperty(rq, settings, "name", "vulkan"); Script::SetProperty(rq, settings, "extensions", m_Extensions); JS::RootedValue device(rq.cx); Script::CreateObject(rq, &device); ReportAvailablePhysicalDevice(m_ChoosenDevice, rq, device); Script::SetProperty(rq, settings, "choosen_device", device); JS::RootedValue availableDevices(rq.cx); Script::CreateArray(rq, &availableDevices, m_AvailablePhysicalDevices.size()); for (size_t index = 0; index < m_AvailablePhysicalDevices.size(); ++index) { JS::RootedValue device(rq.cx); Script::CreateObject(rq, &device); ReportAvailablePhysicalDevice(m_AvailablePhysicalDevices[index], rq, device); Script::SetPropertyInt(rq, availableDevices, index, device); } Script::SetProperty(rq, settings, "available_devices", availableDevices); Script::SetProperty(rq, settings, "instance_extensions", m_InstanceExtensions); Script::SetProperty(rq, settings, "validation_layers", m_ValidationLayers); } std::unique_ptr CDevice::CreateGraphicsPipelineState( const SGraphicsPipelineStateDesc& pipelineStateDesc) { return CGraphicsPipelineState::Create(this, pipelineStateDesc); } +std::unique_ptr CDevice::CreateComputePipelineState( + const SComputePipelineStateDesc& pipelineStateDesc) +{ + return 
CComputePipelineState::Create(this, pipelineStateDesc); +} + std::unique_ptr CDevice::CreateVertexInputLayout( const PS::span attributes) { return std::make_unique(this, attributes); } std::unique_ptr CDevice::CreateTexture( const char* name, const ITexture::Type type, const uint32_t usage, const Format format, const uint32_t width, const uint32_t height, const Sampler::Desc& defaultSamplerDesc, const uint32_t MIPLevelCount, const uint32_t sampleCount) { return CTexture::Create( this, name, type, usage, format, width, height, defaultSamplerDesc, MIPLevelCount, sampleCount); } std::unique_ptr CDevice::CreateTexture2D( const char* name, const uint32_t usage, const Format format, const uint32_t width, const uint32_t height, const Sampler::Desc& defaultSamplerDesc, const uint32_t MIPLevelCount, const uint32_t sampleCount) { return CreateTexture( name, ITexture::Type::TEXTURE_2D, usage, format, width, height, defaultSamplerDesc, MIPLevelCount, sampleCount); } std::unique_ptr CDevice::CreateFramebuffer( const char* name, SColorAttachment* colorAttachment, SDepthStencilAttachment* depthStencilAttachment) { return CFramebuffer::Create( this, name, colorAttachment, depthStencilAttachment); } std::unique_ptr CDevice::CreateBuffer( const char* name, const IBuffer::Type type, const uint32_t size, const bool dynamic) { return CreateCBuffer(name, type, size, dynamic); } std::unique_ptr CDevice::CreateCBuffer( const char* name, const IBuffer::Type type, const uint32_t size, const bool dynamic) { return CBuffer::Create(this, name, type, size, dynamic); } std::unique_ptr CDevice::CreateShaderProgram( const CStr& name, const CShaderDefines& defines) { return CShaderProgram::Create(this, name, defines); } std::unique_ptr CDevice::CreateCommandContext() { return CDeviceCommandContext::Create(this); } bool CDevice::AcquireNextBackbuffer() { if (!IsSwapChainValid()) { vkDeviceWaitIdle(m_Device); RecreateSwapChain(); if (!IsSwapChainValid()) return false; } 
PROFILE3("AcquireNextBackbuffer"); return m_SubmitScheduler->AcquireNextImage(*m_SwapChain); } IFramebuffer* CDevice::GetCurrentBackbuffer( const AttachmentLoadOp colorAttachmentLoadOp, const AttachmentStoreOp colorAttachmentStoreOp, const AttachmentLoadOp depthStencilAttachmentLoadOp, const AttachmentStoreOp depthStencilAttachmentStoreOp) { return IsSwapChainValid() ? m_SwapChain->GetCurrentBackbuffer( colorAttachmentLoadOp, colorAttachmentStoreOp, depthStencilAttachmentLoadOp, depthStencilAttachmentStoreOp) : nullptr; } void CDevice::Present() { if (!IsSwapChainValid()) return; PROFILE3("Present"); m_SubmitScheduler->Present(*m_SwapChain); ProcessObjectToDestroyQueue(); ProcessTextureToDestroyQueue(); ++m_FrameID; } void CDevice::OnWindowResize(const uint32_t width, const uint32_t height) { if (!IsSwapChainValid() || width != m_SwapChain->GetDepthTexture()->GetWidth() || height != m_SwapChain->GetDepthTexture()->GetHeight()) { RecreateSwapChain(); } } bool CDevice::IsTextureFormatSupported(const Format format) const { switch (format) { case Format::UNDEFINED: return false; case Format::R8G8B8_UNORM: return false; case Format::BC1_RGB_UNORM: FALLTHROUGH; case Format::BC1_RGBA_UNORM: FALLTHROUGH; case Format::BC2_UNORM: FALLTHROUGH; case Format::BC3_UNORM: return m_Capabilities.S3TC; default: break; } VkFormatProperties formatProperties{}; vkGetPhysicalDeviceFormatProperties( m_ChoosenDevice.device, Mapping::FromFormat(format), &formatProperties); return formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT; } bool CDevice::IsFramebufferFormatSupported(const Format format) const { VkFormatProperties formatProperties{}; vkGetPhysicalDeviceFormatProperties( m_ChoosenDevice.device, Mapping::FromFormat(format), &formatProperties); if (IsDepthFormat(format)) return formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT; return formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT; } Format 
CDevice::GetPreferredDepthStencilFormat( const uint32_t usage, const bool depth, const bool stencil) const { ENSURE(depth || stencil); Format format = Format::UNDEFINED; if (stencil) { // https://github.com/KhronosGroup/Vulkan-Guide/blob/main/chapters/depth.adoc#depth-formats // At least one of VK_FORMAT_D24_UNORM_S8_UINT or VK_FORMAT_D32_SFLOAT_S8_UINT // must also be supported. if (IsFormatSupportedForUsage(Format::D24_UNORM_S8_UINT, usage)) format = Format::D24_UNORM_S8_UINT; else format = Format::D32_SFLOAT_S8_UINT; } else { std::array formatRequestOrder; // TODO: add most known vendors to enum. // https://developer.nvidia.com/blog/vulkan-dos-donts/ if (m_ChoosenDevice.properties.vendorID == 0x10DE) formatRequestOrder = {Format::D24_UNORM, Format::D32_SFLOAT, Format::D16_UNORM}; else formatRequestOrder = {Format::D32_SFLOAT, Format::D24_UNORM, Format::D16_UNORM}; for (const Format formatRequest : formatRequestOrder) if (IsFormatSupportedForUsage(formatRequest, usage)) { format = formatRequest; break; } } return format; } bool CDevice::IsFormatSupportedForUsage(const Format format, const uint32_t usage) const { VkFormatProperties formatProperties{}; vkGetPhysicalDeviceFormatProperties( m_ChoosenDevice.device, Mapping::FromFormat(format), &formatProperties); VkFormatFeatureFlags expectedFeatures = 0; if (usage & ITexture::Usage::COLOR_ATTACHMENT) expectedFeatures |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT; if (usage & ITexture::Usage::DEPTH_STENCIL_ATTACHMENT) expectedFeatures |= VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT; if (usage & ITexture::Usage::SAMPLED) expectedFeatures |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT; if (usage & ITexture::Usage::TRANSFER_SRC) expectedFeatures |= VK_FORMAT_FEATURE_TRANSFER_SRC_BIT; if (usage & ITexture::Usage::TRANSFER_DST) expectedFeatures |= VK_FORMAT_FEATURE_TRANSFER_DST_BIT; return (formatProperties.optimalTilingFeatures & expectedFeatures) == expectedFeatures; } void CDevice::ScheduleObjectToDestroy( VkObjectType type, 
const uint64_t handle, const VmaAllocation allocation) { m_ObjectToDestroyQueue.push({m_FrameID, type, handle, allocation}); } void CDevice::ScheduleTextureToDestroy(const DeviceObjectUID uid) { m_TextureToDestroyQueue.push({m_FrameID, uid}); } void CDevice::SetObjectName(VkObjectType type, const uint64_t handle, const char* name) { if (!m_Capabilities.debugLabels) return; VkDebugUtilsObjectNameInfoEXT nameInfo{}; nameInfo.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_OBJECT_NAME_INFO_EXT; nameInfo.objectType = type; nameInfo.objectHandle = handle; nameInfo.pObjectName = name; vkSetDebugUtilsObjectNameEXT(m_Device, &nameInfo); } std::unique_ptr CDevice::CreateRingCommandContext(const size_t size) { return std::make_unique( this, size, m_GraphicsQueueFamilyIndex, *m_SubmitScheduler); } void CDevice::RecreateSwapChain() { m_BackbufferReadbackTexture.reset(); int surfaceDrawableWidth = 0, surfaceDrawableHeight = 0; SDL_Vulkan_GetDrawableSize(m_Window, &surfaceDrawableWidth, &surfaceDrawableHeight); m_SwapChain = CSwapChain::Create( this, m_Surface, surfaceDrawableWidth, surfaceDrawableHeight, std::move(m_SwapChain)); } bool CDevice::IsSwapChainValid() { return m_SwapChain && m_SwapChain->IsValid(); } void CDevice::ProcessObjectToDestroyQueue(const bool ignoreFrameID) { while (!m_ObjectToDestroyQueue.empty() && (ignoreFrameID || m_ObjectToDestroyQueue.front().frameID + NUMBER_OF_FRAMES_IN_FLIGHT < m_FrameID)) { ObjectToDestroy& object = m_ObjectToDestroyQueue.front(); #if VK_USE_64_BIT_PTR_DEFINES void* handle = reinterpret_cast(object.handle); #else const uint64_t handle = object.handle; #endif switch (object.type) { case VK_OBJECT_TYPE_IMAGE: vmaDestroyImage(GetVMAAllocator(), static_cast(handle), object.allocation); break; case VK_OBJECT_TYPE_BUFFER: vmaDestroyBuffer(GetVMAAllocator(), static_cast(handle), object.allocation); break; case VK_OBJECT_TYPE_IMAGE_VIEW: vkDestroyImageView(m_Device, static_cast(handle), nullptr); break; case VK_OBJECT_TYPE_BUFFER_VIEW: 
vkDestroyBufferView(m_Device, static_cast(handle), nullptr); break; case VK_OBJECT_TYPE_FRAMEBUFFER: vkDestroyFramebuffer(m_Device, static_cast(handle), nullptr); break; case VK_OBJECT_TYPE_RENDER_PASS: vkDestroyRenderPass(m_Device, static_cast(handle), nullptr); break; case VK_OBJECT_TYPE_SAMPLER: vkDestroySampler(m_Device, static_cast(handle), nullptr); break; case VK_OBJECT_TYPE_SHADER_MODULE: vkDestroyShaderModule(m_Device, static_cast(handle), nullptr); break; case VK_OBJECT_TYPE_PIPELINE_LAYOUT: vkDestroyPipelineLayout(m_Device, static_cast(handle), nullptr); break; case VK_OBJECT_TYPE_PIPELINE: vkDestroyPipeline(m_Device, static_cast(handle), nullptr); break; default: debug_warn("Unsupported object to destroy type."); } m_ObjectToDestroyQueue.pop(); } } void CDevice::ProcessTextureToDestroyQueue(const bool ignoreFrameID) { while (!m_TextureToDestroyQueue.empty() && (ignoreFrameID || m_TextureToDestroyQueue.front().first + NUMBER_OF_FRAMES_IN_FLIGHT < m_FrameID)) { GetDescriptorManager().OnTextureDestroy(m_TextureToDestroyQueue.front().second); m_TextureToDestroyQueue.pop(); } } CTexture* CDevice::GetCurrentBackbufferTexture() { return IsSwapChainValid() ? 
m_SwapChain->GetCurrentBackbufferTexture() : nullptr; } CTexture* CDevice::GetOrCreateBackbufferReadbackTexture() { if (!IsSwapChainValid()) return nullptr; if (!m_BackbufferReadbackTexture) { CTexture* currentBackbufferTexture = m_SwapChain->GetCurrentBackbufferTexture(); m_BackbufferReadbackTexture = CTexture::CreateReadback( this, "BackbufferReadback", currentBackbufferTexture->GetFormat(), currentBackbufferTexture->GetWidth(), currentBackbufferTexture->GetHeight()); } return m_BackbufferReadbackTexture.get(); } DeviceObjectUID CDevice::GenerateNextDeviceObjectUID() { ENSURE(m_LastAvailableUID < std::numeric_limits::max()); return m_LastAvailableUID++; } std::unique_ptr CreateDevice(SDL_Window* window) { return Vulkan::CDevice::Create(window); } } // namespace Vulkan } // namespace Backend } // namespace Renderer Index: ps/trunk/source/renderer/backend/vulkan/PipelineState.cpp =================================================================== --- ps/trunk/source/renderer/backend/vulkan/PipelineState.cpp (revision 28009) +++ ps/trunk/source/renderer/backend/vulkan/PipelineState.cpp (revision 28010) @@ -1,313 +1,352 @@ -/* Copyright (C) 2023 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . 
*/ #include "precompiled.h" #include "PipelineState.h" #include "lib/hash.h" #include "ps/CLogger.h" #include "ps/containers/StaticVector.h" #include "renderer/backend/vulkan/Device.h" #include "renderer/backend/vulkan/Framebuffer.h" #include "renderer/backend/vulkan/Mapping.h" #include "renderer/backend/vulkan/ShaderProgram.h" #include "renderer/backend/vulkan/Utilities.h" #include namespace Renderer { namespace Backend { namespace Vulkan { namespace { VkStencilOpState MakeStencilOpState(const SStencilOpState& opState) { VkStencilOpState result{}; result.failOp = Mapping::FromStencilOp(opState.failOp); result.passOp = Mapping::FromStencilOp(opState.passOp); result.depthFailOp = Mapping::FromStencilOp(opState.depthFailOp); result.compareOp = Mapping::FromCompareOp(opState.compareOp); return result; } } // anonymous namespace size_t CGraphicsPipelineState::CacheKeyHash::operator()(const CacheKey& cacheKey) const { size_t seed = 0; hash_combine(seed, cacheKey.vertexInputLayoutUID); hash_combine(seed, cacheKey.framebufferUID); return seed; } bool CGraphicsPipelineState::CacheKeyEqual::operator()(const CacheKey& lhs, const CacheKey& rhs) const { return lhs.vertexInputLayoutUID == rhs.vertexInputLayoutUID && lhs.framebufferUID == rhs.framebufferUID; } // static std::unique_ptr CGraphicsPipelineState::Create( CDevice* device, const SGraphicsPipelineStateDesc& desc) { ENSURE(desc.shaderProgram); std::unique_ptr pipelineState{new CGraphicsPipelineState()}; pipelineState->m_Device = device; pipelineState->m_UID = device->GenerateNextDeviceObjectUID(); pipelineState->m_Desc = desc; return pipelineState; } CGraphicsPipelineState::~CGraphicsPipelineState() { for (const auto& it : m_PipelineMap) { if (it.second != VK_NULL_HANDLE) m_Device->ScheduleObjectToDestroy( VK_OBJECT_TYPE_PIPELINE, it.second, VK_NULL_HANDLE); } } VkPipeline CGraphicsPipelineState::GetOrCreatePipeline( const CVertexInputLayout* vertexInputLayout, CFramebuffer* framebuffer) { CShaderProgram* shaderProgram 
= m_Desc.shaderProgram->As(); const CacheKey cacheKey = { vertexInputLayout->GetUID(), framebuffer->GetUID() }; auto it = m_PipelineMap.find(cacheKey); if (it != m_PipelineMap.end()) return it->second; PS::StaticVector attributeBindings; PS::StaticVector attributes; const VkPhysicalDeviceLimits& limits = m_Device->GetChoosenPhysicalDevice().properties.limits; const uint32_t maxVertexInputAttributes = limits.maxVertexInputAttributes; const uint32_t maxVertexInputAttributeOffset = limits.maxVertexInputAttributeOffset; for (const SVertexAttributeFormat& vertexAttributeFormat : vertexInputLayout->GetAttributes()) { ENSURE(vertexAttributeFormat.bindingSlot < maxVertexInputAttributes); ENSURE(vertexAttributeFormat.offset < maxVertexInputAttributeOffset); const uint32_t streamLocation = shaderProgram->GetStreamLocation(vertexAttributeFormat.stream); if (streamLocation == std::numeric_limits::max()) continue; auto it = std::find_if(attributeBindings.begin(), attributeBindings.end(), [slot = vertexAttributeFormat.bindingSlot](const VkVertexInputBindingDescription& desc) -> bool { return desc.binding == slot; }); const VkVertexInputBindingDescription desc{ vertexAttributeFormat.bindingSlot, vertexAttributeFormat.stride, vertexAttributeFormat.rate == VertexAttributeRate::PER_INSTANCE ? VK_VERTEX_INPUT_RATE_INSTANCE : VK_VERTEX_INPUT_RATE_VERTEX }; if (it == attributeBindings.end()) attributeBindings.emplace_back(desc); else { // All attribute sharing the same binding slot should have the same description. 
ENSURE(desc.inputRate == it->inputRate && desc.stride == it->stride); } attributes.push_back({ streamLocation, vertexAttributeFormat.bindingSlot, Mapping::FromFormat(vertexAttributeFormat.format), vertexAttributeFormat.offset }); } VkPipelineVertexInputStateCreateInfo vertexInputCreateInfo{}; vertexInputCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; vertexInputCreateInfo.vertexBindingDescriptionCount = std::size(attributeBindings); vertexInputCreateInfo.pVertexBindingDescriptions = attributeBindings.data(); vertexInputCreateInfo.vertexAttributeDescriptionCount = std::size(attributes); vertexInputCreateInfo.pVertexAttributeDescriptions = attributes.data(); VkPipelineInputAssemblyStateCreateInfo inputAssemblyCreateInfo{}; inputAssemblyCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO; inputAssemblyCreateInfo.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; inputAssemblyCreateInfo.primitiveRestartEnable = VK_FALSE; // We don't need to specify sizes for viewports and scissors as they're in // dynamic state. VkViewport viewport{}; viewport.x = 0.0f; viewport.y = 0.0f; viewport.width = 0.0f; viewport.height = 0.0f; viewport.minDepth = 0.0f; viewport.maxDepth = 1.0f; VkRect2D scissor{}; VkPipelineViewportStateCreateInfo viewportStateCreateInfo{}; viewportStateCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO; viewportStateCreateInfo.viewportCount = 1; viewportStateCreateInfo.pViewports = &viewport; viewportStateCreateInfo.scissorCount = 1; viewportStateCreateInfo.pScissors = &scissor; VkPipelineDepthStencilStateCreateInfo depthStencilStateCreateInfo{}; depthStencilStateCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO; depthStencilStateCreateInfo.depthTestEnable = m_Desc.depthStencilState.depthTestEnabled ? VK_TRUE : VK_FALSE; depthStencilStateCreateInfo.depthWriteEnable = m_Desc.depthStencilState.depthWriteEnabled ? 
VK_TRUE : VK_FALSE; depthStencilStateCreateInfo.depthCompareOp = Mapping::FromCompareOp(m_Desc.depthStencilState.depthCompareOp); depthStencilStateCreateInfo.stencilTestEnable = m_Desc.depthStencilState.stencilTestEnabled ? VK_TRUE : VK_FALSE; depthStencilStateCreateInfo.front = MakeStencilOpState(m_Desc.depthStencilState.stencilFrontFace); depthStencilStateCreateInfo.front.reference = m_Desc.depthStencilState.stencilReference; depthStencilStateCreateInfo.front.compareMask = m_Desc.depthStencilState.stencilReadMask; depthStencilStateCreateInfo.front.writeMask = m_Desc.depthStencilState.stencilWriteMask; depthStencilStateCreateInfo.back = MakeStencilOpState(m_Desc.depthStencilState.stencilBackFace); depthStencilStateCreateInfo.back.reference = m_Desc.depthStencilState.stencilReference; depthStencilStateCreateInfo.back.compareMask = m_Desc.depthStencilState.stencilReadMask; depthStencilStateCreateInfo.back.writeMask = m_Desc.depthStencilState.stencilWriteMask; VkPipelineRasterizationStateCreateInfo rasterizationStateCreateInfo{}; rasterizationStateCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; rasterizationStateCreateInfo.depthClampEnable = VK_FALSE; rasterizationStateCreateInfo.rasterizerDiscardEnable = VK_FALSE; const PolygonMode polygonMode = m_Device->GetChoosenPhysicalDevice().features.fillModeNonSolid ? m_Desc.rasterizationState.polygonMode : PolygonMode::FILL; rasterizationStateCreateInfo.polygonMode = Mapping::FromPolygonMode(polygonMode); rasterizationStateCreateInfo.cullMode = Mapping::FromCullMode(m_Desc.rasterizationState.cullMode); rasterizationStateCreateInfo.frontFace = m_Desc.rasterizationState.frontFace == FrontFace::CLOCKWISE ? VK_FRONT_FACE_CLOCKWISE : VK_FRONT_FACE_COUNTER_CLOCKWISE; rasterizationStateCreateInfo.depthBiasEnable = m_Desc.rasterizationState.depthBiasEnabled ? 
VK_TRUE : VK_FALSE; rasterizationStateCreateInfo.depthBiasConstantFactor = m_Desc.rasterizationState.depthBiasConstantFactor; rasterizationStateCreateInfo.depthBiasSlopeFactor = m_Desc.rasterizationState.depthBiasSlopeFactor; rasterizationStateCreateInfo.lineWidth = 1.0f; VkPipelineMultisampleStateCreateInfo multisampleStateCreateInfo{}; multisampleStateCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; multisampleStateCreateInfo.rasterizationSamples = Mapping::FromSampleCount(framebuffer->GetSampleCount()); multisampleStateCreateInfo.minSampleShading = 1.0f; VkPipelineColorBlendAttachmentState colorBlendAttachmentState{}; colorBlendAttachmentState.blendEnable = m_Desc.blendState.enabled ? VK_TRUE : VK_FALSE; colorBlendAttachmentState.colorBlendOp = Mapping::FromBlendOp(m_Desc.blendState.colorBlendOp); colorBlendAttachmentState.srcColorBlendFactor = Mapping::FromBlendFactor(m_Desc.blendState.srcColorBlendFactor); colorBlendAttachmentState.dstColorBlendFactor = Mapping::FromBlendFactor(m_Desc.blendState.dstColorBlendFactor); colorBlendAttachmentState.alphaBlendOp = Mapping::FromBlendOp(m_Desc.blendState.alphaBlendOp); colorBlendAttachmentState.srcAlphaBlendFactor = Mapping::FromBlendFactor(m_Desc.blendState.srcAlphaBlendFactor); colorBlendAttachmentState.dstAlphaBlendFactor = Mapping::FromBlendFactor(m_Desc.blendState.dstAlphaBlendFactor); colorBlendAttachmentState.colorWriteMask = Mapping::FromColorWriteMask(m_Desc.blendState.colorWriteMask); VkPipelineColorBlendStateCreateInfo colorBlendStateCreateInfo{}; colorBlendStateCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; colorBlendStateCreateInfo.logicOpEnable = VK_FALSE; colorBlendStateCreateInfo.logicOp = VK_LOGIC_OP_CLEAR; colorBlendStateCreateInfo.attachmentCount = 1; colorBlendStateCreateInfo.pAttachments = &colorBlendAttachmentState; colorBlendStateCreateInfo.blendConstants[0] = m_Desc.blendState.constant.r; colorBlendStateCreateInfo.blendConstants[1] = 
m_Desc.blendState.constant.g; colorBlendStateCreateInfo.blendConstants[2] = m_Desc.blendState.constant.b; colorBlendStateCreateInfo.blendConstants[3] = m_Desc.blendState.constant.a; const VkDynamicState dynamicStates[] = { VK_DYNAMIC_STATE_SCISSOR, VK_DYNAMIC_STATE_VIEWPORT }; VkPipelineDynamicStateCreateInfo dynamicStateCreateInfo{}; dynamicStateCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO; dynamicStateCreateInfo.dynamicStateCount = static_cast(std::size(dynamicStates)); dynamicStateCreateInfo.pDynamicStates = dynamicStates; VkGraphicsPipelineCreateInfo pipelineCreateInfo{}; pipelineCreateInfo.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; pipelineCreateInfo.stageCount = shaderProgram->GetStages().size(); pipelineCreateInfo.pStages = shaderProgram->GetStages().data(); pipelineCreateInfo.pVertexInputState = &vertexInputCreateInfo; pipelineCreateInfo.pInputAssemblyState = &inputAssemblyCreateInfo; pipelineCreateInfo.pViewportState = &viewportStateCreateInfo; pipelineCreateInfo.pRasterizationState = &rasterizationStateCreateInfo; pipelineCreateInfo.pMultisampleState = &multisampleStateCreateInfo; // If renderPass is not VK_NULL_HANDLE, the pipeline is being created with // fragment shader state, and subpass uses a depth/stencil attachment, // pDepthStencilState must be a not null pointer. 
if (framebuffer->GetDepthStencilAttachment()) pipelineCreateInfo.pDepthStencilState = &depthStencilStateCreateInfo; if (!framebuffer->GetColorAttachments().empty()) pipelineCreateInfo.pColorBlendState = &colorBlendStateCreateInfo; pipelineCreateInfo.pDynamicState = &dynamicStateCreateInfo; pipelineCreateInfo.layout = shaderProgram->GetPipelineLayout(); pipelineCreateInfo.renderPass = framebuffer->GetRenderPass(); pipelineCreateInfo.subpass = 0; pipelineCreateInfo.basePipelineHandle = VK_NULL_HANDLE; pipelineCreateInfo.basePipelineIndex = -1; VkPipeline pipeline = VK_NULL_HANDLE; ENSURE_VK_SUCCESS(vkCreateGraphicsPipelines( m_Device->GetVkDevice(), VK_NULL_HANDLE, 1, &pipelineCreateInfo, nullptr, &pipeline)); m_PipelineMap[cacheKey] = pipeline; return pipeline; } IDevice* CGraphicsPipelineState::GetDevice() { return m_Device; } +// static +std::unique_ptr CComputePipelineState::Create( + CDevice* device, const SComputePipelineStateDesc& desc) +{ + ENSURE(desc.shaderProgram); + CShaderProgram* shaderProgram = desc.shaderProgram->As(); + if (shaderProgram->GetStages().empty()) + return nullptr; + + std::unique_ptr pipelineState{new CComputePipelineState()}; + pipelineState->m_Device = device; + pipelineState->m_UID = device->GenerateNextDeviceObjectUID(); + pipelineState->m_Desc = desc; + + VkComputePipelineCreateInfo pipelineCreateInfo{}; + pipelineCreateInfo.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO; + + pipelineCreateInfo.layout = shaderProgram->GetPipelineLayout(); + pipelineCreateInfo.basePipelineHandle = VK_NULL_HANDLE; + pipelineCreateInfo.basePipelineIndex = -1; + pipelineCreateInfo.stage = shaderProgram->GetStages()[0]; + + ENSURE_VK_SUCCESS(vkCreateComputePipelines( + device->GetVkDevice(), VK_NULL_HANDLE, 1, &pipelineCreateInfo, nullptr, &pipelineState->m_Pipeline)); + return pipelineState; +} + +CComputePipelineState::~CComputePipelineState() +{ + if (m_Pipeline != VK_NULL_HANDLE) + m_Device->ScheduleObjectToDestroy( + VK_OBJECT_TYPE_PIPELINE, 
m_Pipeline, VK_NULL_HANDLE); +} + +IDevice* CComputePipelineState::GetDevice() +{ + return m_Device; +} + } // namespace Vulkan } // namespace Backend } // namespace Renderer Index: ps/trunk/source/renderer/backend/vulkan/DescriptorManager.h =================================================================== --- ps/trunk/source/renderer/backend/vulkan/DescriptorManager.h (revision 28009) +++ ps/trunk/source/renderer/backend/vulkan/DescriptorManager.h (revision 28010) @@ -1,214 +1,216 @@ -/* Copyright (C) 2023 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . */ #ifndef INCLUDED_RENDERER_BACKEND_VULKAN_DESCRIPTORMANAGER #define INCLUDED_RENDERER_BACKEND_VULKAN_DESCRIPTORMANAGER #include "ps/CStrIntern.h" #include "renderer/backend/Sampler.h" #include "renderer/backend/vulkan/Device.h" #include "renderer/backend/vulkan/Texture.h" #include #include #include #include #include #include namespace Renderer { namespace Backend { namespace Vulkan { class CDevice; class CDescriptorManager { public: CDescriptorManager(CDevice* device, const bool useDescriptorIndexing); ~CDescriptorManager(); bool UseDescriptorIndexing() const { return m_UseDescriptorIndexing; } /** * @return a single type descriptor set layout with the number of bindings * equals to the size. The returned layout is owned by the manager. 
*/ VkDescriptorSetLayout GetSingleTypeDescritorSetLayout( VkDescriptorType type, const uint32_t size); VkDescriptorSet GetSingleTypeDescritorSet( VkDescriptorType type, VkDescriptorSetLayout layout, const std::vector& texturesUID, const std::vector& textures); uint32_t GetUniformSet() const; uint32_t GetTextureDescriptor(CTexture* texture); void OnTextureDestroy(const DeviceObjectUID uid); const VkDescriptorSetLayout& GetDescriptorIndexingSetLayout() const { return m_DescriptorIndexingSetLayout; } const VkDescriptorSetLayout& GetUniformDescriptorSetLayout() const { return m_UniformDescriptorSetLayout; } const VkDescriptorSet& GetDescriptorIndexingSet() { return m_DescriptorIndexingSet; } const std::vector& GetDescriptorSetLayouts() const { return m_DescriptorSetLayouts; } private: struct SingleTypePool { VkDescriptorSetLayout layout; VkDescriptorPool pool; int16_t firstFreeIndex = 0; static constexpr int16_t INVALID_INDEX = -1; struct Element { VkDescriptorSet set = VK_NULL_HANDLE; uint32_t version = 0; int16_t nextFreeIndex = INVALID_INDEX; }; std::vector elements; }; SingleTypePool& GetSingleTypePool(const VkDescriptorType type, const uint32_t size); std::pair GetSingleTypeDescritorSetImpl( VkDescriptorType type, VkDescriptorSetLayout layout, const std::vector& uids); CDevice* m_Device = nullptr; bool m_UseDescriptorIndexing = false; VkDescriptorPool m_DescriptorIndexingPool = VK_NULL_HANDLE; VkDescriptorSet m_DescriptorIndexingSet = VK_NULL_HANDLE; VkDescriptorSetLayout m_DescriptorIndexingSetLayout = VK_NULL_HANDLE; VkDescriptorSetLayout m_UniformDescriptorSetLayout = VK_NULL_HANDLE; std::vector m_DescriptorSetLayouts; static constexpr uint32_t DESCRIPTOR_INDEXING_BINDING_SIZE = 16384; static constexpr uint32_t NUMBER_OF_BINDINGS_PER_DESCRIPTOR_INDEXING_SET = 3; struct DescriptorIndexingBindingMap { static_assert(std::numeric_limits::max() >= DESCRIPTOR_INDEXING_BINDING_SIZE); int16_t firstFreeIndex = 0; std::vector elements; std::unordered_map map; }; 
std::array m_DescriptorIndexingBindings; std::unordered_map m_TextureToBindingMap; std::unordered_map> m_SingleTypePools; struct SingleTypePoolReference { VkDescriptorType type = VK_DESCRIPTOR_TYPE_MAX_ENUM; uint32_t version = 0; int16_t elementIndex = SingleTypePool::INVALID_INDEX; uint8_t size = 0; }; std::unordered_map> m_UIDToSingleTypePoolMap; using SingleTypeCacheKey = std::pair>; struct SingleTypeCacheKeyHash { size_t operator()(const SingleTypeCacheKey& key) const; }; std::unordered_map m_SingleTypeSets; std::unique_ptr m_ErrorTexture; }; // TODO: ideally we might want to separate a set and its mapping. template class CSingleTypeDescriptorSetBinding { public: CSingleTypeDescriptorSetBinding(CDevice* device, const VkDescriptorType type, const uint32_t size, std::unordered_map mapping) : m_Device{device}, m_Type{type}, m_Mapping{std::move(mapping)} { m_BoundDeviceObjects.resize(size); m_BoundUIDs.resize(size); m_DescriptorSetLayout = m_Device->GetDescriptorManager().GetSingleTypeDescritorSetLayout(m_Type, size); } int32_t GetBindingSlot(const CStrIntern name) const { const auto it = m_Mapping.find(name); return it != m_Mapping.end() ? 
it->second : -1; } void SetObject(const int32_t bindingSlot, DeviceObject* object) { if (m_BoundUIDs[bindingSlot] == object->GetUID()) return; m_BoundUIDs[bindingSlot] = object->GetUID(); m_BoundDeviceObjects[bindingSlot] = object; m_Outdated = true; } bool IsOutdated() const { return m_Outdated; } VkDescriptorSet UpdateAndReturnDescriptorSet() { ENSURE(m_Outdated); m_Outdated = false; VkDescriptorSet descriptorSet = m_Device->GetDescriptorManager().GetSingleTypeDescritorSet( m_Type, m_DescriptorSetLayout, m_BoundUIDs, m_BoundDeviceObjects); ENSURE(descriptorSet != VK_NULL_HANDLE); return descriptorSet; } void Unbind() { std::fill(m_BoundDeviceObjects.begin(), m_BoundDeviceObjects.end(), nullptr); std::fill(m_BoundUIDs.begin(), m_BoundUIDs.end(), INVALID_DEVICE_OBJECT_UID); m_Outdated = true; } VkDescriptorSetLayout GetDescriptorSetLayout() { return m_DescriptorSetLayout; } + const std::vector& GetBoundDeviceObjects() const { return m_BoundDeviceObjects; } + private: CDevice* const m_Device; const VkDescriptorType m_Type; const std::unordered_map m_Mapping; bool m_Outdated{true}; VkDescriptorSetLayout m_DescriptorSetLayout{VK_NULL_HANDLE}; std::vector m_BoundDeviceObjects; std::vector m_BoundUIDs; }; } // namespace Vulkan } // namespace Backend } // namespace Renderer #endif // INCLUDED_RENDERER_BACKEND_VULKAN_DESCRIPTORMANAGER Index: ps/trunk/source/renderer/backend/vulkan/DeviceCommandContext.h =================================================================== --- ps/trunk/source/renderer/backend/vulkan/DeviceCommandContext.h (revision 28009) +++ ps/trunk/source/renderer/backend/vulkan/DeviceCommandContext.h (revision 28010) @@ -1,199 +1,212 @@ -/* Copyright (C) 2023 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. 
is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . */ #ifndef INCLUDED_RENDERER_VULKAN_DEVICECOMMANDCONTEXT #define INCLUDED_RENDERER_VULKAN_DEVICECOMMANDCONTEXT #include "ps/containers/StaticVector.h" #include "renderer/backend/IBuffer.h" #include "renderer/backend/IDeviceCommandContext.h" #include #include #include namespace Renderer { namespace Backend { namespace Vulkan { class CBuffer; class CDevice; class CFramebuffer; class CGraphicsPipelineState; class CRingCommandContext; class CShaderProgram; class CVertexInputLayout; class CDeviceCommandContext final : public IDeviceCommandContext { public: ~CDeviceCommandContext() override; IDevice* GetDevice() override; void SetGraphicsPipelineState(IGraphicsPipelineState* pipelineState) override; + void SetComputePipelineState(IComputePipelineState* pipelineState) override; void BlitFramebuffer( IFramebuffer* destinationFramebuffer, IFramebuffer* sourceFramebuffer, const Rect& destinationRegion, const Rect& sourceRegion, const Sampler::Filter filter) override; void ResolveFramebuffer( IFramebuffer* destinationFramebuffer, IFramebuffer* sourceFramebuffer) override; void ClearFramebuffer(const bool color, const bool depth, const bool stencil) override; void BeginFramebufferPass(IFramebuffer* framebuffer) override; void EndFramebufferPass() override; void ReadbackFramebufferSync( const uint32_t x, const uint32_t y, const uint32_t width, const uint32_t height, void* data) override; void 
UploadTexture(ITexture* texture, const Format dataFormat, const void* data, const size_t dataSize, const uint32_t level = 0, const uint32_t layer = 0) override; void UploadTextureRegion(ITexture* texture, const Format dataFormat, const void* data, const size_t dataSize, const uint32_t xOffset, const uint32_t yOffset, const uint32_t width, const uint32_t height, const uint32_t level = 0, const uint32_t layer = 0) override; void UploadBuffer(IBuffer* buffer, const void* data, const uint32_t dataSize) override; void UploadBuffer(IBuffer* buffer, const UploadBufferFunction& uploadFunction) override; void UploadBufferRegion( IBuffer* buffer, const void* data, const uint32_t dataOffset, const uint32_t dataSize) override; void UploadBufferRegion( IBuffer* buffer, const uint32_t dataOffset, const uint32_t dataSize, const UploadBufferFunction& uploadFunction) override; void SetScissors(const uint32_t scissorCount, const Rect* scissors) override; void SetViewports(const uint32_t viewportCount, const Rect* viewports) override; void SetVertexInputLayout( IVertexInputLayout* vertexInputLayout) override; void SetVertexBuffer( const uint32_t bindingSlot, IBuffer* buffer, const uint32_t offset) override; void SetVertexBufferData( const uint32_t bindingSlot, const void* data, const uint32_t dataSize) override; void SetIndexBuffer(IBuffer* buffer) override; void SetIndexBufferData(const void* data, const uint32_t dataSize) override; void BeginPass() override; void EndPass() override; void Draw(const uint32_t firstVertex, const uint32_t vertexCount) override; void DrawIndexed( const uint32_t firstIndex, const uint32_t indexCount, const int32_t vertexOffset) override; void DrawInstanced( const uint32_t firstVertex, const uint32_t vertexCount, const uint32_t firstInstance, const uint32_t instanceCount) override; void DrawIndexedInstanced( const uint32_t firstIndex, const uint32_t indexCount, const uint32_t firstInstance, const uint32_t instanceCount, const int32_t vertexOffset) 
override; void DrawIndexedInRange( const uint32_t firstIndex, const uint32_t indexCount, const uint32_t start, const uint32_t end) override; + void BeginComputePass() override; + void EndComputePass() override; + + void Dispatch( + const uint32_t groupCountX, + const uint32_t groupCountY, + const uint32_t groupCountZ) override; + void SetTexture(const int32_t bindingSlot, ITexture* texture) override; + void SetStorageTexture(const int32_t bindingSlot, ITexture* texture) override; + void SetUniform( const int32_t bindingSlot, const float value) override; void SetUniform( const int32_t bindingSlot, const float valueX, const float valueY) override; void SetUniform( const int32_t bindingSlot, const float valueX, const float valueY, const float valueZ) override; void SetUniform( const int32_t bindingSlot, const float valueX, const float valueY, const float valueZ, const float valueW) override; void SetUniform( const int32_t bindingSlot, PS::span values) override; void BeginScopedLabel(const char* name) override; void EndScopedLabel() override; void Flush() override; private: friend class CDevice; static std::unique_ptr Create(CDevice* device); CDeviceCommandContext(); void PreDraw(); + void UpdateOutdatedConstants(); void ApplyPipelineStateIfDirty(); void BindVertexBuffer(const uint32_t bindingSlot, CBuffer* buffer, uint32_t offset); void BindIndexBuffer(CBuffer* buffer, uint32_t offset); CDevice* m_Device = nullptr; bool m_DebugScopedLabels = false; std::unique_ptr m_PrependCommandContext; std::unique_ptr m_CommandContext; CGraphicsPipelineState* m_GraphicsPipelineState = nullptr; CVertexInputLayout* m_VertexInputLayout = nullptr; CFramebuffer* m_Framebuffer = nullptr; CShaderProgram* m_ShaderProgram = nullptr; bool m_IsPipelineStateDirty = true; VkPipeline m_LastBoundPipeline = VK_NULL_HANDLE; bool m_InsideFramebufferPass = false; bool m_InsidePass = false; + bool m_InsideComputePass = false; // Currently bound buffers to skip the same buffer bind. 
CBuffer* m_BoundIndexBuffer = nullptr; uint32_t m_BoundIndexBufferOffset = 0; class CUploadRing; std::unique_ptr m_VertexUploadRing, m_IndexUploadRing, m_UniformUploadRing; VkDescriptorPool m_UniformDescriptorPool = VK_NULL_HANDLE; VkDescriptorSet m_UniformDescriptorSet = VK_NULL_HANDLE; // Currently we support readbacks only from backbuffer. struct QueuedReadback { uint32_t x = 0, y = 0; uint32_t width = 0, height = 0; // It's a responsibility of the caller to guarantee that data is valid. void* data = nullptr; }; PS::StaticVector m_QueuedReadbacks; bool m_DebugBarrierAfterFramebufferPass = false; }; } // namespace Vulkan } // namespace Backend } // namespace Renderer #endif // INCLUDED_RENDERER_VULKAN_DEVICECOMMANDCONTEXT Index: ps/trunk/source/renderer/backend/vulkan/ShaderProgram.cpp =================================================================== --- ps/trunk/source/renderer/backend/vulkan/ShaderProgram.cpp (revision 28009) +++ ps/trunk/source/renderer/backend/vulkan/ShaderProgram.cpp (revision 28010) @@ -1,694 +1,836 @@ -/* Copyright (C) 2023 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . 
*/ #include "precompiled.h" #include "ShaderProgram.h" #include "graphics/ShaderDefines.h" #include "ps/CLogger.h" #include "ps/containers/StaticVector.h" #include "ps/CStr.h" #include "ps/CStrInternStatic.h" #include "ps/Filesystem.h" #include "ps/Profile.h" #include "ps/XML/Xeromyces.h" #include "renderer/backend/vulkan/DescriptorManager.h" #include "renderer/backend/vulkan/Device.h" #include "renderer/backend/vulkan/RingCommandContext.h" #include "renderer/backend/vulkan/Texture.h" #include "renderer/backend/vulkan/Utilities.h" #include #include namespace Renderer { namespace Backend { namespace Vulkan { namespace { VkShaderModule CreateShaderModule(CDevice* device, const VfsPath& path) { CVFSFile file; if (file.Load(g_VFS, path) != PSRETURN_OK) { LOGERROR("Failed to load shader file: '%s'", path.string8()); return VK_NULL_HANDLE; } VkShaderModuleCreateInfo createInfo{}; createInfo.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; // Casting to uint32_t requires to fit alignment and size. ENSURE(file.GetBufferSize() % 4 == 0); ENSURE(reinterpret_cast(file.GetBuffer()) % alignof(uint32_t) == 0u); createInfo.codeSize = file.GetBufferSize(); createInfo.pCode = reinterpret_cast(file.GetBuffer()); VkShaderModule shaderModule; const VkResult result = vkCreateShaderModule(device->GetVkDevice(), &createInfo, nullptr, &shaderModule); if (result != VK_SUCCESS) { LOGERROR("Failed to create shader module from file: '%s' %d (%s)", path.string8(), static_cast(result), Utilities::GetVkResultName(result)); return VK_NULL_HANDLE; } device->SetObjectName(VK_OBJECT_TYPE_SHADER_MODULE, shaderModule, path.string8().c_str()); return shaderModule; } VfsPath FindProgramMatchingDefines(const VfsPath& xmlFilename, const CShaderDefines& defines) { CXeromyces xeroFile; PSRETURN ret = xeroFile.Load(g_VFS, xmlFilename); if (ret != PSRETURN_OK) return {}; // TODO: add XML validation. 
#define EL(x) const int el_##x = xeroFile.GetElementID(#x) #define AT(x) const int at_##x = xeroFile.GetAttributeID(#x) EL(define); EL(defines); EL(program); AT(file); AT(name); AT(value); #undef AT #undef EL const CStrIntern strUndefined("UNDEFINED"); VfsPath programFilename; XMBElement root = xeroFile.GetRoot(); XERO_ITER_EL(root, rootChild) { if (rootChild.GetNodeName() == el_program) { CShaderDefines programDefines; XERO_ITER_EL(rootChild, programChild) { if (programChild.GetNodeName() == el_defines) { XERO_ITER_EL(programChild, definesChild) { XMBAttributeList attributes = definesChild.GetAttributes(); if (definesChild.GetNodeName() == el_define) { const CStrIntern value(attributes.GetNamedItem(at_value)); if (value == strUndefined) continue; programDefines.Add( CStrIntern(attributes.GetNamedItem(at_name)), value); } } } } if (programDefines == defines) return L"shaders/" + rootChild.GetAttributes().GetNamedItem(at_file).FromUTF8(); } } return {}; } } // anonymous namespace IDevice* CVertexInputLayout::GetDevice() { return m_Device; } // static std::unique_ptr CShaderProgram::Create( CDevice* device, const CStr& name, const CShaderDefines& baseDefines) { const VfsPath xmlFilename = L"shaders/" + wstring_from_utf8(name) + L".xml"; std::unique_ptr shaderProgram(new CShaderProgram()); shaderProgram->m_Device = device; shaderProgram->m_FileDependencies = {xmlFilename}; CShaderDefines defines = baseDefines; if (device->GetDescriptorManager().UseDescriptorIndexing()) defines.Add(str_USE_DESCRIPTOR_INDEXING, str_1); const VfsPath programFilename = FindProgramMatchingDefines(xmlFilename, defines); if (programFilename.empty()) { LOGERROR("Program '%s' with required defines not found.", name); for (const auto& pair : defines.GetMap()) LOGERROR(" \"%s\": \"%s\"", pair.first.c_str(), pair.second.c_str()); return nullptr; } shaderProgram->m_FileDependencies.emplace_back(programFilename); CXeromyces programXeroFile; if (programXeroFile.Load(g_VFS, programFilename) != 
PSRETURN_OK) return nullptr; XMBElement programRoot = programXeroFile.GetRoot(); #define EL(x) const int el_##x = programXeroFile.GetElementID(#x) #define AT(x) const int at_##x = programXeroFile.GetAttributeID(#x) EL(binding); + EL(compute); EL(descriptor_set); EL(descriptor_sets); EL(fragment); EL(member); EL(push_constant); EL(stream); EL(vertex); AT(binding); AT(file); AT(location); AT(name); AT(offset); AT(set); AT(size); AT(type); #undef AT #undef EL auto addPushConstant = [&pushConstants=shaderProgram->m_PushConstants, &pushConstantDataFlags=shaderProgram->m_PushConstantDataFlags, &at_name, &at_offset, &at_size]( const XMBElement& element, VkShaderStageFlags stageFlags) -> bool { const XMBAttributeList attributes = element.GetAttributes(); const CStrIntern name = CStrIntern(attributes.GetNamedItem(at_name)); const uint32_t size = attributes.GetNamedItem(at_size).ToUInt(); const uint32_t offset = attributes.GetNamedItem(at_offset).ToUInt(); if (offset % 4 != 0 || size % 4 != 0) { LOGERROR("Push constant should have offset and size be multiple of 4."); return false; } for (PushConstant& pushConstant : pushConstants) { if (pushConstant.name == name) { if (size != pushConstant.size || offset != pushConstant.offset) { LOGERROR("All shared push constants must have the same size and offset."); return false; } // We found the same constant so we don't need to add it again. 
pushConstant.stageFlags |= stageFlags; for (uint32_t index = 0; index < (size >> 2); ++index) pushConstantDataFlags[(offset >> 2) + index] |= stageFlags; return true; } if (offset + size < pushConstant.offset || offset >= pushConstant.offset + pushConstant.size) continue; LOGERROR("All push constant must not intersect each other in memory."); return false; } pushConstants.push_back({name, offset, size, stageFlags}); for (uint32_t index = 0; index < (size >> 2); ++index) pushConstantDataFlags[(offset >> 2) + index] = stageFlags; return true; }; uint32_t texturesDescriptorSetSize = 0; std::unordered_map textureMapping; + VkDescriptorType storageImageDescriptorType = VK_DESCRIPTOR_TYPE_MAX_ENUM; + uint32_t storageImageDescriptorSetSize = 0; + std::unordered_map storageImageMapping; + auto addDescriptorSets = [&](const XMBElement& element) -> bool { const bool useDescriptorIndexing = device->GetDescriptorManager().UseDescriptorIndexing(); // TODO: reduce the indentation. XERO_ITER_EL(element, descriporSetsChild) { if (descriporSetsChild.GetNodeName() == el_descriptor_set) { const uint32_t set = descriporSetsChild.GetAttributes().GetNamedItem(at_set).ToUInt(); if (useDescriptorIndexing && set == 0 && !descriporSetsChild.GetChildNodes().empty()) { LOGERROR("Descritor set for descriptor indexing shouldn't contain bindings."); return false; } XERO_ITER_EL(descriporSetsChild, descriporSetChild) { if (descriporSetChild.GetNodeName() == el_binding) { const XMBAttributeList attributes = descriporSetChild.GetAttributes(); const uint32_t binding = attributes.GetNamedItem(at_binding).ToUInt(); const uint32_t size = attributes.GetNamedItem(at_size).ToUInt(); const CStr type = attributes.GetNamedItem(at_type); if (type == "uniform") { const uint32_t expectedSet = device->GetDescriptorManager().GetUniformSet(); if (set != expectedSet || binding != 0) { LOGERROR("We support only a single uniform block per shader program."); return false; } shaderProgram->m_MaterialConstantsDataSize = 
size; XERO_ITER_EL(descriporSetChild, bindingChild) { if (bindingChild.GetNodeName() == el_member) { const XMBAttributeList memberAttributes = bindingChild.GetAttributes(); const uint32_t offset = memberAttributes.GetNamedItem(at_offset).ToUInt(); const uint32_t size = memberAttributes.GetNamedItem(at_size).ToUInt(); const CStrIntern name{memberAttributes.GetNamedItem(at_name)}; bool found = false; for (const Uniform& uniform : shaderProgram->m_Uniforms) { if (uniform.name == name) { if (offset != uniform.offset || size != uniform.size) { LOGERROR("All uniforms across all stage should match."); return false; } found = true; } else { if (offset + size <= uniform.offset || uniform.offset + uniform.size <= offset) continue; LOGERROR("Uniforms must not overlap each other."); return false; } } if (!found) shaderProgram->m_Uniforms.push_back({name, offset, size}); } } } else if (type == "sampler1D" || type == "sampler2D" || type == "sampler2DShadow" || type == "sampler3D" || type == "samplerCube") { if (useDescriptorIndexing) { LOGERROR("We support only uniform descriptor sets with enabled descriptor indexing."); return false; } const CStrIntern name{attributes.GetNamedItem(at_name)}; textureMapping[name] = binding; texturesDescriptorSetSize = std::max(texturesDescriptorSetSize, binding + 1); } + else if (type == "storageImage" || type == "storageBuffer") + { + const CStrIntern name{attributes.GetNamedItem(at_name)}; + storageImageMapping[name] = binding; + storageImageDescriptorSetSize = + std::max(storageImageDescriptorSetSize, binding + 1); + const VkDescriptorType descriptorType = type == "storageBuffer" + ? 
VK_DESCRIPTOR_TYPE_STORAGE_BUFFER + : VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; + if (storageImageDescriptorType == VK_DESCRIPTOR_TYPE_MAX_ENUM) + storageImageDescriptorType = descriptorType; + else if (storageImageDescriptorType != descriptorType) + { + LOGERROR("Shader should have storages of the same type."); + return false; + } + } else { LOGERROR("Unsupported binding: '%s'", type.c_str()); return false; } } } } } return true; }; XERO_ITER_EL(programRoot, programChild) { if (programChild.GetNodeName() == el_vertex) { + if (shaderProgram->m_PipelineBindPoint != VK_PIPELINE_BIND_POINT_MAX_ENUM && + shaderProgram->m_PipelineBindPoint != VK_PIPELINE_BIND_POINT_GRAPHICS) + { + LOGERROR("Shader program can't mix different pipelines: '%s'.", name.c_str()); + return nullptr; + } + shaderProgram->m_PipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; const VfsPath shaderModulePath = L"shaders/" + programChild.GetAttributes().GetNamedItem(at_file).FromUTF8(); shaderProgram->m_FileDependencies.emplace_back(shaderModulePath); shaderProgram->m_ShaderModules.emplace_back( CreateShaderModule(device, shaderModulePath)); if (shaderProgram->m_ShaderModules.back() == VK_NULL_HANDLE) return nullptr; VkPipelineShaderStageCreateInfo vertexShaderStageInfo{}; vertexShaderStageInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; vertexShaderStageInfo.stage = VK_SHADER_STAGE_VERTEX_BIT; vertexShaderStageInfo.module = shaderProgram->m_ShaderModules.back(); vertexShaderStageInfo.pName = "main"; shaderProgram->m_Stages.emplace_back(std::move(vertexShaderStageInfo)); XERO_ITER_EL(programChild, stageChild) { if (stageChild.GetNodeName() == el_stream) { XMBAttributeList attributes = stageChild.GetAttributes(); const uint32_t location = attributes.GetNamedItem(at_location).ToUInt(); const CStr streamName = attributes.GetNamedItem(at_name); VertexAttributeStream stream = VertexAttributeStream::UV7; if (streamName == "pos") stream = VertexAttributeStream::POSITION; else if (streamName == 
"normal") stream = VertexAttributeStream::NORMAL; else if (streamName == "color") stream = VertexAttributeStream::COLOR; else if (streamName == "uv0") stream = VertexAttributeStream::UV0; else if (streamName == "uv1") stream = VertexAttributeStream::UV1; else if (streamName == "uv2") stream = VertexAttributeStream::UV2; else if (streamName == "uv3") stream = VertexAttributeStream::UV3; else if (streamName == "uv4") stream = VertexAttributeStream::UV4; else if (streamName == "uv5") stream = VertexAttributeStream::UV5; else if (streamName == "uv6") stream = VertexAttributeStream::UV6; else if (streamName == "uv7") stream = VertexAttributeStream::UV7; else debug_warn("Unknown stream"); shaderProgram->m_StreamLocations[stream] = location; } else if (stageChild.GetNodeName() == el_push_constant) { if (!addPushConstant(stageChild, VK_SHADER_STAGE_VERTEX_BIT)) return nullptr; } else if (stageChild.GetNodeName() == el_descriptor_sets) { if (!addDescriptorSets(stageChild)) return nullptr; } } } else if (programChild.GetNodeName() == el_fragment) { + if (shaderProgram->m_PipelineBindPoint != VK_PIPELINE_BIND_POINT_MAX_ENUM && + shaderProgram->m_PipelineBindPoint != VK_PIPELINE_BIND_POINT_GRAPHICS) + { + LOGERROR("Shader program can't mix different pipelines: '%s'.", name.c_str()); + return nullptr; + } + shaderProgram->m_PipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; const VfsPath shaderModulePath = L"shaders/" + programChild.GetAttributes().GetNamedItem(at_file).FromUTF8(); shaderProgram->m_FileDependencies.emplace_back(shaderModulePath); shaderProgram->m_ShaderModules.emplace_back( CreateShaderModule(device, shaderModulePath)); if (shaderProgram->m_ShaderModules.back() == VK_NULL_HANDLE) return nullptr; VkPipelineShaderStageCreateInfo fragmentShaderStageInfo{}; fragmentShaderStageInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; fragmentShaderStageInfo.stage = VK_SHADER_STAGE_FRAGMENT_BIT; fragmentShaderStageInfo.module = 
shaderProgram->m_ShaderModules.back(); fragmentShaderStageInfo.pName = "main"; shaderProgram->m_Stages.emplace_back(std::move(fragmentShaderStageInfo)); XERO_ITER_EL(programChild, stageChild) { if (stageChild.GetNodeName() == el_push_constant) { if (!addPushConstant(stageChild, VK_SHADER_STAGE_FRAGMENT_BIT)) return nullptr; } else if (stageChild.GetNodeName() == el_descriptor_sets) { if (!addDescriptorSets(stageChild)) return nullptr; } } } + else if (programChild.GetNodeName() == el_compute) + { + if (shaderProgram->m_PipelineBindPoint != VK_PIPELINE_BIND_POINT_MAX_ENUM && + shaderProgram->m_PipelineBindPoint != VK_PIPELINE_BIND_POINT_COMPUTE) + { + LOGERROR("Shader program can't mix different pipelines: '%s'.", name.c_str()); + return nullptr; + } + shaderProgram->m_PipelineBindPoint = VK_PIPELINE_BIND_POINT_COMPUTE; + const VfsPath shaderModulePath = + L"shaders/" + programChild.GetAttributes().GetNamedItem(at_file).FromUTF8(); + shaderProgram->m_FileDependencies.emplace_back(shaderModulePath); + shaderProgram->m_ShaderModules.emplace_back( + CreateShaderModule(device, shaderModulePath)); + if (shaderProgram->m_ShaderModules.back() == VK_NULL_HANDLE) + return nullptr; + VkPipelineShaderStageCreateInfo computeShaderStageInfo{}; + computeShaderStageInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + computeShaderStageInfo.stage = VK_SHADER_STAGE_COMPUTE_BIT; + computeShaderStageInfo.module = shaderProgram->m_ShaderModules.back(); + computeShaderStageInfo.pName = "main"; + shaderProgram->m_Stages.emplace_back(std::move(computeShaderStageInfo)); + XERO_ITER_EL(programChild, stageChild) + { + if (stageChild.GetNodeName() == el_push_constant) + { + if (!addPushConstant(stageChild, VK_SHADER_STAGE_COMPUTE_BIT)) + return nullptr; + } + else if (stageChild.GetNodeName() == el_descriptor_sets) + { + if (!addDescriptorSets(stageChild)) + return nullptr; + } + } + } } if (shaderProgram->m_Stages.empty()) { LOGERROR("Program should contain at least one 
stage."); return nullptr; } + ENSURE(shaderProgram->m_PipelineBindPoint != VK_PIPELINE_BIND_POINT_MAX_ENUM); + for (size_t index = 0; index < shaderProgram->m_PushConstants.size(); ++index) shaderProgram->m_PushConstantMapping[shaderProgram->m_PushConstants[index].name] = index; std::vector pushConstantRanges; pushConstantRanges.reserve(shaderProgram->m_PushConstants.size()); std::transform( shaderProgram->m_PushConstants.begin(), shaderProgram->m_PushConstants.end(), std::back_insert_iterator(pushConstantRanges), [](const PushConstant& pushConstant) { return VkPushConstantRange{pushConstant.stageFlags, pushConstant.offset, pushConstant.size}; }); if (!pushConstantRanges.empty()) { std::sort(pushConstantRanges.begin(), pushConstantRanges.end(), [](const VkPushConstantRange& lhs, const VkPushConstantRange& rhs) { return lhs.offset < rhs.offset; }); // Merge subsequent constants. auto it = pushConstantRanges.begin(); while (std::next(it) != pushConstantRanges.end()) { auto next = std::next(it); if (it->stageFlags == next->stageFlags) { it->size = next->offset - it->offset + next->size; pushConstantRanges.erase(next); } else it = next; } for (const VkPushConstantRange& range : pushConstantRanges) if (std::count_if(pushConstantRanges.begin(), pushConstantRanges.end(), [stageFlags=range.stageFlags](const VkPushConstantRange& range) { return range.stageFlags & stageFlags; }) != 1) { LOGERROR("Any two range must not include the same stage in stageFlags."); return nullptr; } } for (size_t index = 0; index < shaderProgram->m_Uniforms.size(); ++index) shaderProgram->m_UniformMapping[shaderProgram->m_Uniforms[index].name] = index; if (!shaderProgram->m_Uniforms.empty()) { if (shaderProgram->m_MaterialConstantsDataSize > device->GetChoosenPhysicalDevice().properties.limits.maxUniformBufferRange) { LOGERROR("Uniform buffer size is too big for the device."); return nullptr; } shaderProgram->m_MaterialConstantsData = std::make_unique(shaderProgram->m_MaterialConstantsDataSize); } 
std::vector layouts = device->GetDescriptorManager().GetDescriptorSetLayouts(); if (texturesDescriptorSetSize > 0) { ENSURE(!device->GetDescriptorManager().UseDescriptorIndexing()); shaderProgram->m_TextureBinding.emplace( device, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, texturesDescriptorSetSize, std::move(textureMapping)); layouts.emplace_back(shaderProgram->m_TextureBinding->GetDescriptorSetLayout()); } + if (storageImageDescriptorSetSize > 0) + { + shaderProgram->m_StorageImageBinding.emplace( + device, storageImageDescriptorType, storageImageDescriptorSetSize, std::move(storageImageMapping)); + layouts.emplace_back(shaderProgram->m_StorageImageBinding->GetDescriptorSetLayout()); + } VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo{}; pipelineLayoutCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; pipelineLayoutCreateInfo.setLayoutCount = layouts.size(); pipelineLayoutCreateInfo.pSetLayouts = layouts.data(); pipelineLayoutCreateInfo.pushConstantRangeCount = pushConstantRanges.size(); pipelineLayoutCreateInfo.pPushConstantRanges = pushConstantRanges.data(); const VkResult result = vkCreatePipelineLayout( device->GetVkDevice(), &pipelineLayoutCreateInfo, nullptr, &shaderProgram->m_PipelineLayout); if (result != VK_SUCCESS) { LOGERROR("Failed to create a pipeline layout: %d (%s)", static_cast(result), Utilities::GetVkResultName(result)); return nullptr; } return shaderProgram; } CShaderProgram::CShaderProgram() = default; CShaderProgram::~CShaderProgram() { if (m_PipelineLayout != VK_NULL_HANDLE) m_Device->ScheduleObjectToDestroy(VK_OBJECT_TYPE_PIPELINE_LAYOUT, m_PipelineLayout, VK_NULL_HANDLE); for (VkShaderModule shaderModule : m_ShaderModules) if (shaderModule != VK_NULL_HANDLE) m_Device->ScheduleObjectToDestroy(VK_OBJECT_TYPE_SHADER_MODULE, shaderModule, VK_NULL_HANDLE); } IDevice* CShaderProgram::GetDevice() { return m_Device; } int32_t CShaderProgram::GetBindingSlot(const CStrIntern name) const { if (auto it = 
m_PushConstantMapping.find(name); it != m_PushConstantMapping.end()) return it->second; if (auto it = m_UniformMapping.find(name); it != m_UniformMapping.end()) return it->second + m_PushConstants.size(); if (const int32_t bindingSlot = m_TextureBinding.has_value() ? m_TextureBinding->GetBindingSlot(name) : -1; bindingSlot != -1) return bindingSlot + m_PushConstants.size() + m_UniformMapping.size(); + if (const int32_t bindingSlot = m_StorageImageBinding.has_value() ? m_StorageImageBinding->GetBindingSlot(name) : -1; bindingSlot != -1) + return bindingSlot + m_PushConstants.size() + m_UniformMapping.size() + (m_TextureBinding.has_value() ? m_TextureBinding->GetBoundDeviceObjects().size() : 0); return -1; } std::vector CShaderProgram::GetFileDependencies() const { return m_FileDependencies; } uint32_t CShaderProgram::GetStreamLocation(const VertexAttributeStream stream) const { auto it = m_StreamLocations.find(stream); return it != m_StreamLocations.end() ? it->second : std::numeric_limits::max(); } void CShaderProgram::Bind() { if (m_MaterialConstantsData) m_MaterialConstantsDataOutdated = true; } void CShaderProgram::Unbind() { if (m_TextureBinding.has_value()) m_TextureBinding->Unbind(); + if (m_StorageImageBinding.has_value()) + m_StorageImageBinding->Unbind(); } void CShaderProgram::PreDraw(CRingCommandContext& commandContext) { BindOutdatedDescriptorSets(commandContext); if (m_PushConstantDataMask) { for (uint32_t index = 0; index < 32;) { if (!(m_PushConstantDataMask & (1 << index))) { ++index; continue; } uint32_t indexEnd = index + 1; while (indexEnd < 32 && (m_PushConstantDataMask & (1 << indexEnd)) && m_PushConstantDataFlags[index] == m_PushConstantDataFlags[indexEnd]) ++indexEnd; vkCmdPushConstants( commandContext.GetCommandBuffer(), GetPipelineLayout(), m_PushConstantDataFlags[index], index * 4, (indexEnd - index) * 4, m_PushConstantData.data() + index * 4); index = indexEnd; } m_PushConstantDataMask = 0; } } +void CShaderProgram::PreDispatch( + 
CRingCommandContext& commandContext) +{ + PreDraw(commandContext); + + if (m_StorageImageBinding.has_value()) + for (CTexture* texture : m_StorageImageBinding->GetBoundDeviceObjects()) + if (texture) + { + if (!(texture->GetUsage() & ITexture::Usage::SAMPLED) && texture->IsInitialized()) + continue; + VkImageLayout oldLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + if (!texture->IsInitialized()) + oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; + Utilities::SetTextureLayout( + commandContext.GetCommandBuffer(), texture, + oldLayout, + VK_IMAGE_LAYOUT_GENERAL, + VK_ACCESS_SHADER_READ_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT); + } +} + +void CShaderProgram::PostDispatch(CRingCommandContext& commandContext) +{ + if (m_StorageImageBinding.has_value()) + for (CTexture* texture : m_StorageImageBinding->GetBoundDeviceObjects()) + if (texture) + { + if (!(texture->GetUsage() & ITexture::Usage::SAMPLED) && texture->IsInitialized()) + continue; + Utilities::SetTextureLayout( + commandContext.GetCommandBuffer(), texture, + VK_IMAGE_LAYOUT_GENERAL, + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, + VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT); + } +} + void CShaderProgram::BindOutdatedDescriptorSets( CRingCommandContext& commandContext) { // TODO: combine calls after more sets to bind. 
- PS::StaticVector, 1> descriptortSets; + PS::StaticVector, 2> descriptortSets; if (m_TextureBinding.has_value() && m_TextureBinding->IsOutdated()) { constexpr uint32_t TEXTURE_BINDING_SET = 1u; descriptortSets.emplace_back(TEXTURE_BINDING_SET, m_TextureBinding->UpdateAndReturnDescriptorSet()); } + if (m_StorageImageBinding.has_value() && m_StorageImageBinding->IsOutdated()) + { + constexpr uint32_t STORAGE_IMAGE_BINDING_SET = 2u; + descriptortSets.emplace_back(STORAGE_IMAGE_BINDING_SET, m_StorageImageBinding->UpdateAndReturnDescriptorSet()); + } for (const auto [firstSet, descriptorSet] : descriptortSets) { vkCmdBindDescriptorSets( commandContext.GetCommandBuffer(), GetPipelineBindPoint(), GetPipelineLayout(), firstSet, 1, &descriptorSet, 0, nullptr); } } void CShaderProgram::SetUniform( const int32_t bindingSlot, const float value) { const float values[1] = {value}; SetUniform(bindingSlot, PS::span(values, values + 1)); } void CShaderProgram::SetUniform( const int32_t bindingSlot, const float valueX, const float valueY) { const float values[2] = {valueX, valueY}; SetUniform(bindingSlot, PS::span(values, values + 2)); } void CShaderProgram::SetUniform( const int32_t bindingSlot, const float valueX, const float valueY, const float valueZ) { const float values[3] = {valueX, valueY, valueZ}; SetUniform(bindingSlot, PS::span(values, values + 3)); } void CShaderProgram::SetUniform( const int32_t bindingSlot, const float valueX, const float valueY, const float valueZ, const float valueW) { const float values[4] = {valueX, valueY, valueZ, valueW}; SetUniform(bindingSlot, PS::span(values, values + 4)); } void CShaderProgram::SetUniform(const int32_t bindingSlot, PS::span values) { if (bindingSlot < 0) return; const auto data = GetUniformData(bindingSlot, values.size() * sizeof(float)); std::memcpy(data.first, values.data(), data.second); } std::pair CShaderProgram::GetUniformData( const int32_t bindingSlot, const uint32_t dataSize) { if (bindingSlot < 
static_cast(m_PushConstants.size())) { const uint32_t size = m_PushConstants[bindingSlot].size; const uint32_t offset = m_PushConstants[bindingSlot].offset; ENSURE(size <= dataSize); m_PushConstantDataMask |= ((1 << (size >> 2)) - 1) << (offset >> 2); return {m_PushConstantData.data() + offset, size}; } else { ENSURE(bindingSlot - m_PushConstants.size() < m_Uniforms.size()); const Uniform& uniform = m_Uniforms[bindingSlot - m_PushConstants.size()]; m_MaterialConstantsDataOutdated = true; const uint32_t size = uniform.size; const uint32_t offset = uniform.offset; ENSURE(size <= dataSize); return {m_MaterialConstantsData.get() + offset, size}; } } void CShaderProgram::SetTexture(const int32_t bindingSlot, CTexture* texture) { if (bindingSlot < 0) return; CDescriptorManager& descriptorManager = m_Device->GetDescriptorManager(); if (descriptorManager.UseDescriptorIndexing()) { const uint32_t descriptorIndex = descriptorManager.GetTextureDescriptor(texture->As()); ENSURE(bindingSlot < static_cast(m_PushConstants.size())); const uint32_t size = m_PushConstants[bindingSlot].size; const uint32_t offset = m_PushConstants[bindingSlot].offset; ENSURE(size == sizeof(descriptorIndex)); std::memcpy(m_PushConstantData.data() + offset, &descriptorIndex, size); m_PushConstantDataMask |= ((1 << (size >> 2)) - 1) << (offset >> 2); } else { ENSURE(bindingSlot >= static_cast(m_PushConstants.size() + m_UniformMapping.size())); ENSURE(m_TextureBinding.has_value()); const uint32_t index = bindingSlot - (m_PushConstants.size() + m_UniformMapping.size()); m_TextureBinding->SetObject(index, texture); } } +void CShaderProgram::SetStorageTexture(const int32_t bindingSlot, CTexture* texture) +{ + if (bindingSlot < 0) + return; + const int32_t offset = static_cast(m_PushConstants.size() + m_UniformMapping.size() + (m_TextureBinding.has_value() ? 
m_TextureBinding->GetBoundDeviceObjects().size() : 0)); + ENSURE(bindingSlot >= offset); + ENSURE(m_StorageImageBinding.has_value()); + const uint32_t index = bindingSlot - offset; + m_StorageImageBinding->SetObject(index, texture); +} + } // namespace Vulkan } // namespace Backend } // namespace Renderer Index: ps/trunk/source/renderer/backend/gl/ShaderProgram.cpp =================================================================== --- ps/trunk/source/renderer/backend/gl/ShaderProgram.cpp (revision 28009) +++ ps/trunk/source/renderer/backend/gl/ShaderProgram.cpp (revision 28010) @@ -1,1512 +1,1548 @@ -/* Copyright (C) 2023 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . 
*/ #include "precompiled.h" #include "ShaderProgram.h" #include "graphics/Color.h" #include "graphics/PreprocessorWrapper.h" #include "graphics/ShaderManager.h" #include "graphics/TextureManager.h" #include "ps/CLogger.h" #include "ps/containers/StaticVector.h" #include "ps/Filesystem.h" #include "ps/Profile.h" #include "ps/XML/Xeromyces.h" #include "renderer/backend/gl/Device.h" #include "renderer/backend/gl/DeviceCommandContext.h" #define USE_SHADER_XML_VALIDATION 1 #if USE_SHADER_XML_VALIDATION #include "ps/XML/RelaxNG.h" #include "ps/XML/XMLWriter.h" #endif #include #include #include #include namespace Renderer { namespace Backend { namespace GL { namespace { struct Binding { Binding(int a, int b) : first(a), second(b) { } Binding() : first(-1), second(-1) { } /** * Returns whether this uniform attribute is active in the shader. * If not then there's no point calling Uniform() to set its value. */ bool Active() const { return first != -1 || second != -1; } int first; int second; }; int GetStreamMask(const VertexAttributeStream stream) { return 1 << static_cast(stream); } GLint GLSizeFromFormat(const Format format) { GLint size = 1; if (format == Renderer::Backend::Format::R32_SFLOAT || format == Renderer::Backend::Format::R16_SINT) size = 1; else if ( format == Renderer::Backend::Format::R8G8_UNORM || format == Renderer::Backend::Format::R8G8_UINT || format == Renderer::Backend::Format::R16G16_SINT || format == Renderer::Backend::Format::R32G32_SFLOAT) size = 2; else if (format == Renderer::Backend::Format::R32G32B32_SFLOAT) size = 3; else if ( format == Renderer::Backend::Format::R32G32B32A32_SFLOAT || format == Renderer::Backend::Format::R8G8B8A8_UNORM || format == Renderer::Backend::Format::R8G8B8A8_UINT) size = 4; else debug_warn("Unsupported format."); return size; } GLenum GLTypeFromFormat(const Format format) { GLenum type = GL_FLOAT; if (format == Renderer::Backend::Format::R32_SFLOAT || format == Renderer::Backend::Format::R32G32_SFLOAT || format == 
Renderer::Backend::Format::R32G32B32_SFLOAT || format == Renderer::Backend::Format::R32G32B32A32_SFLOAT) type = GL_FLOAT; else if ( format == Renderer::Backend::Format::R16_SINT || format == Renderer::Backend::Format::R16G16_SINT) type = GL_SHORT; else if ( format == Renderer::Backend::Format::R8G8_UNORM || format == Renderer::Backend::Format::R8G8_UINT || format == Renderer::Backend::Format::R8G8B8A8_UNORM || format == Renderer::Backend::Format::R8G8B8A8_UINT) type = GL_UNSIGNED_BYTE; else debug_warn("Unsupported format."); return type; } GLboolean NormalizedFromFormat(const Format format) { switch (format) { case Format::R8G8_UNORM: FALLTHROUGH; case Format::R8G8B8_UNORM: FALLTHROUGH; case Format::R8G8B8A8_UNORM: FALLTHROUGH; case Format::R16_UNORM: FALLTHROUGH; case Format::R16G16_UNORM: return GL_TRUE; default: break; } return GL_FALSE; } int GetAttributeLocationFromStream( CDevice* device, const VertexAttributeStream stream) { // Old mapping makes sense only if we have an old/low-end hardware. Else we // need to use sequential numbering to fix #3054. We use presence of // compute shaders as a check that the hardware has universal CUs. 
if (device->GetCapabilities().computeShaders) { return static_cast(stream); } else { // Map known semantics onto the attribute locations documented by NVIDIA: // https://download.nvidia.com/developer/Papers/2005/OpenGL_2.0/NVIDIA_OpenGL_2.0_Support.pdf // https://developer.download.nvidia.com/opengl/glsl/glsl_release_notes.pdf switch (stream) { case VertexAttributeStream::POSITION: return 0; case VertexAttributeStream::NORMAL: return 2; case VertexAttributeStream::COLOR: return 3; case VertexAttributeStream::UV0: return 8; case VertexAttributeStream::UV1: return 9; case VertexAttributeStream::UV2: return 10; case VertexAttributeStream::UV3: return 11; case VertexAttributeStream::UV4: return 12; case VertexAttributeStream::UV5: return 13; case VertexAttributeStream::UV6: return 14; case VertexAttributeStream::UV7: return 15; } } debug_warn("Invalid attribute semantics"); return 0; } bool PreprocessShaderFile( bool arb, const CShaderDefines& defines, const VfsPath& path, const char* stage, CStr& source, std::vector& fileDependencies) { CVFSFile file; if (file.Load(g_VFS, path) != PSRETURN_OK) { LOGERROR("Failed to load shader file: '%s'", path.string8()); return false; } CPreprocessorWrapper preprocessor( [arb, &fileDependencies](const CStr& includePath, CStr& out) -> bool { const VfsPath includeFilePath( (arb ? L"shaders/arb/" : L"shaders/glsl/") + wstring_from_utf8(includePath)); // Add dependencies anyway to reload the shader when the file is // appeared. 
fileDependencies.push_back(includeFilePath); CVFSFile includeFile; if (includeFile.Load(g_VFS, includeFilePath) != PSRETURN_OK) { LOGERROR("Failed to load shader include file: '%s'", includeFilePath.string8()); return false; } out = includeFile.GetAsString(); return true; }); preprocessor.AddDefines(defines); if (!arb) preprocessor.AddDefine(stage, "1"); #if CONFIG2_GLES if (!arb) { // GLES defines the macro "GL_ES" in its GLSL preprocessor, // but since we run our own preprocessor first, we need to explicitly // define it here preprocessor.AddDefine("GL_ES", "1"); } #endif source = preprocessor.Preprocess(file.GetAsString()); return true; } #if !CONFIG2_GLES std::tuple GetElementTypeAndCountFromString(const CStr& str) { #define CASE(MATCH_STRING, TYPE, ELEMENT_TYPE, ELEMENT_COUNT) \ if (str == MATCH_STRING) return {GL_ ## TYPE, GL_ ## ELEMENT_TYPE, ELEMENT_COUNT} CASE("float", FLOAT, FLOAT, 1); CASE("vec2", FLOAT_VEC2, FLOAT, 2); CASE("vec3", FLOAT_VEC3, FLOAT, 3); CASE("vec4", FLOAT_VEC4, FLOAT, 4); CASE("mat2", FLOAT_MAT2, FLOAT, 4); CASE("mat3", FLOAT_MAT3, FLOAT, 9); CASE("mat4", FLOAT_MAT4, FLOAT, 16); #if !CONFIG2_GLES // GL ES 2.0 doesn't support non-square matrices. CASE("mat2x3", FLOAT_MAT2x3, FLOAT, 6); CASE("mat2x4", FLOAT_MAT2x4, FLOAT, 8); CASE("mat3x2", FLOAT_MAT3x2, FLOAT, 6); CASE("mat3x4", FLOAT_MAT3x4, FLOAT, 12); CASE("mat4x2", FLOAT_MAT4x2, FLOAT, 8); CASE("mat4x3", FLOAT_MAT4x3, FLOAT, 12); #endif // A somewhat incomplete listing, missing "shadow" and "rect" versions // which are interpreted as 2D (NB: our shadowmaps may change // type based on user config). 
#if CONFIG2_GLES if (str == "sampler1D") debug_warn(L"sampler1D not implemented on GLES"); #else CASE("sampler1D", SAMPLER_1D, TEXTURE_1D, 1); #endif CASE("sampler2D", SAMPLER_2D, TEXTURE_2D, 1); #if CONFIG2_GLES if (str == "sampler2DShadow") debug_warn(L"sampler2DShadow not implemented on GLES"); if (str == "sampler3D") debug_warn(L"sampler3D not implemented on GLES"); #else CASE("sampler2DShadow", SAMPLER_2D_SHADOW, TEXTURE_2D, 1); CASE("sampler3D", SAMPLER_3D, TEXTURE_3D, 1); #endif CASE("samplerCube", SAMPLER_CUBE, TEXTURE_CUBE_MAP, 1); #undef CASE return {0, 0, 0}; } #endif // !CONFIG2_GLES bool CompileGLSL(GLuint shader, const VfsPath& file, const CStr& code) { const char* codeString = code.c_str(); GLint codeLength = code.length(); glShaderSource(shader, 1, &codeString, &codeLength); ogl_WarnIfError(); glCompileShader(shader); GLint ok = 0; glGetShaderiv(shader, GL_COMPILE_STATUS, &ok); GLint length = 0; glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &length); // Apparently sometimes GL_INFO_LOG_LENGTH is incorrectly reported as 0 // (http://code.google.com/p/android/issues/detail?id=9953) if (!ok && length == 0) length = 4096; if (length > 1) { std::unique_ptr infolog = std::make_unique(length); glGetShaderInfoLog(shader, length, nullptr, infolog.get()); if (ok) LOGMESSAGE("Info when compiling shader '%s':\n%s", file.string8(), infolog.get()); else LOGERROR("Failed to compile shader '%s':\n%s", file.string8(), infolog.get()); } ogl_WarnIfError(); return ok; } } // anonymous namespace IDevice* CVertexInputLayout::GetDevice() { return m_Device; } #if !CONFIG2_GLES class CShaderProgramARB final : public CShaderProgram { public: CShaderProgramARB( CDevice* device, const VfsPath& path, const VfsPath& vertexFilePath, const VfsPath& fragmentFilePath, const CShaderDefines& defines, const std::map>& vertexIndices, const std::map>& fragmentIndices, int streamflags) : CShaderProgram(streamflags), m_Device(device) { glGenProgramsARB(1, &m_VertexProgram); glGenProgramsARB(1, 
&m_FragmentProgram); std::vector newFileDependencies = {path, vertexFilePath, fragmentFilePath}; CStr vertexCode; if (!PreprocessShaderFile(true, defines, vertexFilePath, "STAGE_VERTEX", vertexCode, newFileDependencies)) return; CStr fragmentCode; if (!PreprocessShaderFile(true, defines, fragmentFilePath, "STAGE_FRAGMENT", fragmentCode, newFileDependencies)) return; m_FileDependencies = std::move(newFileDependencies); // TODO: replace by scoped bind. m_Device->GetActiveCommandContext()->SetGraphicsPipelineState( MakeDefaultGraphicsPipelineStateDesc()); if (!Compile(GL_VERTEX_PROGRAM_ARB, "vertex", m_VertexProgram, vertexFilePath, vertexCode)) return; if (!Compile(GL_FRAGMENT_PROGRAM_ARB, "fragment", m_FragmentProgram, fragmentFilePath, fragmentCode)) return; for (const auto& index : vertexIndices) { BindingSlot& bindingSlot = GetOrCreateBindingSlot(index.first); bindingSlot.vertexProgramLocation = index.second.second; const auto [type, elementType, elementCount] = GetElementTypeAndCountFromString(index.second.first); bindingSlot.type = type; bindingSlot.elementType = elementType; bindingSlot.elementCount = elementCount; } for (const auto& index : fragmentIndices) { BindingSlot& bindingSlot = GetOrCreateBindingSlot(index.first); bindingSlot.fragmentProgramLocation = index.second.second; const auto [type, elementType, elementCount] = GetElementTypeAndCountFromString(index.second.first); if (bindingSlot.type && type != bindingSlot.type) { LOGERROR("CShaderProgramARB: vertex and fragment program uniforms with the same name should have the same type."); } bindingSlot.type = type; bindingSlot.elementType = elementType; bindingSlot.elementCount = elementCount; } } ~CShaderProgramARB() override { glDeleteProgramsARB(1, &m_VertexProgram); glDeleteProgramsARB(1, &m_FragmentProgram); } bool Compile(GLuint target, const char* targetName, GLuint program, const VfsPath& file, const CStr& code) { ogl_WarnIfError(); glBindProgramARB(target, program); ogl_WarnIfError(); 
glProgramStringARB(target, GL_PROGRAM_FORMAT_ASCII_ARB, (GLsizei)code.length(), code.c_str()); if (ogl_SquelchError(GL_INVALID_OPERATION)) { GLint errPos = 0; glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &errPos); int errLine = std::count(code.begin(), code.begin() + std::min((int)code.length(), errPos + 1), '\n') + 1; char* errStr = (char*)glGetString(GL_PROGRAM_ERROR_STRING_ARB); LOGERROR("Failed to compile %s program '%s' (line %d):\n%s", targetName, file.string8(), errLine, errStr); return false; } glBindProgramARB(target, 0); ogl_WarnIfError(); return true; } void Bind(CShaderProgram* previousShaderProgram) override { if (previousShaderProgram) previousShaderProgram->Unbind(); glBindProgramARB(GL_VERTEX_PROGRAM_ARB, m_VertexProgram); glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, m_FragmentProgram); BindClientStates(); } void Unbind() override { glBindProgramARB(GL_VERTEX_PROGRAM_ARB, 0); glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, 0); UnbindClientStates(); } IDevice* GetDevice() override { return m_Device; } int32_t GetBindingSlot(const CStrIntern name) const override { auto it = m_BindingSlotsMapping.find(name); return it == m_BindingSlotsMapping.end() ? 
-1 : it->second; } TextureUnit GetTextureUnit(const int32_t bindingSlot) override { if (bindingSlot < 0 || bindingSlot >= static_cast(m_BindingSlots.size())) return { 0, 0, 0 }; TextureUnit textureUnit; textureUnit.type = m_BindingSlots[bindingSlot].type; textureUnit.target = m_BindingSlots[bindingSlot].elementType; textureUnit.unit = m_BindingSlots[bindingSlot].fragmentProgramLocation; return textureUnit; } void SetUniform( const int32_t bindingSlot, const float value) override { if (bindingSlot < 0 || bindingSlot >= static_cast(m_BindingSlots.size())) return; if (m_BindingSlots[bindingSlot].type != GL_FLOAT) { LOGERROR("CShaderProgramARB::SetUniform(): Invalid uniform type (expected float)"); return; } SetUniform(m_BindingSlots[bindingSlot], value, 0.0f, 0.0f, 0.0f); } void SetUniform( const int32_t bindingSlot, const float valueX, const float valueY) override { if (bindingSlot < 0 || bindingSlot >= static_cast(m_BindingSlots.size())) return; if (m_BindingSlots[bindingSlot].type != GL_FLOAT_VEC2) { LOGERROR("CShaderProgramARB::SetUniform(): Invalid uniform type (expected vec2)"); return; } SetUniform(m_BindingSlots[bindingSlot], valueX, valueY, 0.0f, 0.0f); } void SetUniform( const int32_t bindingSlot, const float valueX, const float valueY, const float valueZ) override { if (bindingSlot < 0 || bindingSlot >= static_cast(m_BindingSlots.size())) return; if (m_BindingSlots[bindingSlot].type != GL_FLOAT_VEC3) { LOGERROR("CShaderProgramARB::SetUniform(): Invalid uniform type (expected vec3)"); return; } SetUniform(m_BindingSlots[bindingSlot], valueX, valueY, valueZ, 0.0f); } void SetUniform( const int32_t bindingSlot, const float valueX, const float valueY, const float valueZ, const float valueW) override { if (bindingSlot < 0 || bindingSlot >= static_cast(m_BindingSlots.size())) return; if (m_BindingSlots[bindingSlot].type != GL_FLOAT_VEC4) { LOGERROR("CShaderProgramARB::SetUniform(): Invalid uniform type (expected vec4)"); return; } 
SetUniform(m_BindingSlots[bindingSlot], valueX, valueY, valueZ, valueW); } void SetUniform( const int32_t bindingSlot, PS::span values) override { if (bindingSlot < 0 || bindingSlot >= static_cast(m_BindingSlots.size())) return; if (m_BindingSlots[bindingSlot].elementType != GL_FLOAT) { LOGERROR("CShaderProgramARB::SetUniform(): Invalid uniform element type (expected float)"); return; } if (m_BindingSlots[bindingSlot].elementCount > static_cast(values.size())) { LOGERROR( "CShaderProgramARB::SetUniform(): Invalid uniform element count (expected: %zu passed: %zu)", m_BindingSlots[bindingSlot].elementCount, values.size()); return; } const GLenum type = m_BindingSlots[bindingSlot].type; if (type == GL_FLOAT) SetUniform(m_BindingSlots[bindingSlot], values[0], 0.0f, 0.0f, 0.0f); else if (type == GL_FLOAT_VEC2) SetUniform(m_BindingSlots[bindingSlot], values[0], values[1], 0.0f, 0.0f); else if (type == GL_FLOAT_VEC3) SetUniform(m_BindingSlots[bindingSlot], values[0], values[1], values[2], 0.0f); else if (type == GL_FLOAT_VEC4) SetUniform(m_BindingSlots[bindingSlot], values[0], values[1], values[2], values[3]); else if (type == GL_FLOAT_MAT4) SetUniformMatrix(m_BindingSlots[bindingSlot], values); else LOGERROR("CShaderProgramARB::SetUniform(): Invalid uniform type (expected float, vec2, vec3, vec4, mat4)"); ogl_WarnIfError(); } std::vector GetFileDependencies() const override { return m_FileDependencies; } private: struct BindingSlot { CStrIntern name; int vertexProgramLocation; int fragmentProgramLocation; GLenum type; GLenum elementType; GLint elementCount; }; BindingSlot& GetOrCreateBindingSlot(const CStrIntern name) { auto it = m_BindingSlotsMapping.find(name); if (it == m_BindingSlotsMapping.end()) { m_BindingSlotsMapping[name] = m_BindingSlots.size(); BindingSlot bindingSlot{}; bindingSlot.name = name; bindingSlot.vertexProgramLocation = -1; bindingSlot.fragmentProgramLocation = -1; bindingSlot.elementType = 0; bindingSlot.elementCount = 0; 
m_BindingSlots.emplace_back(std::move(bindingSlot)); return m_BindingSlots.back(); } else return m_BindingSlots[it->second]; } void SetUniform( const BindingSlot& bindingSlot, const float v0, const float v1, const float v2, const float v3) { SetUniform(GL_VERTEX_PROGRAM_ARB, bindingSlot.vertexProgramLocation, v0, v1, v2, v3); SetUniform(GL_FRAGMENT_PROGRAM_ARB, bindingSlot.fragmentProgramLocation, v0, v1, v2, v3); } void SetUniform( const GLenum target, const int location, const float v0, const float v1, const float v2, const float v3) { if (location >= 0) { glProgramLocalParameter4fARB( target, static_cast(location), v0, v1, v2, v3); } } void SetUniformMatrix( const BindingSlot& bindingSlot, PS::span values) { const size_t mat4ElementCount = 16; ENSURE(values.size() == mat4ElementCount); SetUniformMatrix(GL_VERTEX_PROGRAM_ARB, bindingSlot.vertexProgramLocation, values); SetUniformMatrix(GL_FRAGMENT_PROGRAM_ARB, bindingSlot.fragmentProgramLocation, values); } void SetUniformMatrix( const GLenum target, const int location, PS::span values) { if (location >= 0) { glProgramLocalParameter4fARB( target, static_cast(location + 0), values[0], values[4], values[8], values[12]); glProgramLocalParameter4fARB( target, static_cast(location + 1), values[1], values[5], values[9], values[13]); glProgramLocalParameter4fARB( target, static_cast(location + 2), values[2], values[6], values[10], values[14]); glProgramLocalParameter4fARB( target, static_cast(location + 3), values[3], values[7], values[11], values[15]); } } CDevice* m_Device = nullptr; std::vector m_FileDependencies; GLuint m_VertexProgram; GLuint m_FragmentProgram; std::vector m_BindingSlots; std::unordered_map m_BindingSlotsMapping; }; #endif // !CONFIG2_GLES class CShaderProgramGLSL final : public CShaderProgram { public: CShaderProgramGLSL( CDevice* device, const CStr& name, const VfsPath& programPath, PS::span> shaderStages, const CShaderDefines& defines, const std::map& vertexAttribs, int streamflags) : 
CShaderProgram(streamflags), m_Device(device), m_Name(name), m_VertexAttribs(vertexAttribs) { for (std::map::iterator it = m_VertexAttribs.begin(); it != m_VertexAttribs.end(); ++it) m_ActiveVertexAttributes.emplace_back(it->second); std::sort(m_ActiveVertexAttributes.begin(), m_ActiveVertexAttributes.end()); m_Program = 0; m_FileDependencies = {programPath}; for (const auto& [path, type] : shaderStages) + { + UNUSED2(type); m_FileDependencies.emplace_back(path); + } // TODO: replace by scoped bind. m_Device->GetActiveCommandContext()->SetGraphicsPipelineState( MakeDefaultGraphicsPipelineStateDesc()); std::vector newFileDependencies = {programPath}; for (const auto& [path, type] : shaderStages) { GLuint shader = glCreateShader(type); newFileDependencies.emplace_back(path); #if !CONFIG2_GLES if (m_Device->GetCapabilities().debugLabels) glObjectLabel(GL_SHADER, shader, -1, path.string8().c_str()); #endif m_ShaderStages.emplace_back(type, shader); const char* stageDefine = "STAGE_UNDEFINED"; switch (type) { case GL_VERTEX_SHADER: stageDefine = "STAGE_VERTEX"; break; case GL_FRAGMENT_SHADER: stageDefine = "STAGE_FRAGMENT"; break; + case GL_COMPUTE_SHADER: + stageDefine = "STAGE_COMPUTE"; + break; default: break; } CStr source; if (!PreprocessShaderFile(false, defines, path, stageDefine, source, newFileDependencies)) return; if (source.empty()) { LOGERROR("Failed to preprocess shader: '%s'", path.string8()); return; } #if CONFIG2_GLES // Ugly hack to replace desktop GLSL 1.10/1.20 with GLSL ES 1.00, // and also to set default float precision for fragment shaders source.Replace("#version 110\n", "#version 100\nprecision highp float;\n"); source.Replace("#version 110\r\n", "#version 100\nprecision highp float;\n"); source.Replace("#version 120\n", "#version 100\nprecision highp float;\n"); source.Replace("#version 120\r\n", "#version 100\nprecision highp float;\n"); #endif if (!CompileGLSL(shader, path, source)) return; } m_FileDependencies = 
std::move(newFileDependencies); if (!Link(programPath)) return; } ~CShaderProgramGLSL() override { if (m_Program) glDeleteProgram(m_Program); for (ShaderStage& stage : m_ShaderStages) glDeleteShader(stage.shader); } bool Link(const VfsPath& path) { ENSURE(!m_Program); m_Program = glCreateProgram(); #if !CONFIG2_GLES if (m_Device->GetCapabilities().debugLabels) { glObjectLabel(GL_PROGRAM, m_Program, -1, m_Name.c_str()); } #endif for (ShaderStage& stage : m_ShaderStages) { glAttachShader(m_Program, stage.shader); ogl_WarnIfError(); } // Set up the attribute bindings explicitly, since apparently drivers // don't always pick the most efficient bindings automatically, // and also this lets us hardcode indexes into VertexPointer etc for (std::map::iterator it = m_VertexAttribs.begin(); it != m_VertexAttribs.end(); ++it) glBindAttribLocation(m_Program, it->second, it->first.c_str()); glLinkProgram(m_Program); GLint ok = 0; glGetProgramiv(m_Program, GL_LINK_STATUS, &ok); GLint length = 0; glGetProgramiv(m_Program, GL_INFO_LOG_LENGTH, &length); if (!ok && length == 0) length = 4096; if (length > 1) { char* infolog = new char[length]; glGetProgramInfoLog(m_Program, length, NULL, infolog); if (ok) LOGMESSAGE("Info when linking program '%s':\n%s", path.string8(), infolog); else LOGERROR("Failed to link program '%s':\n%s", path.string8(), infolog); delete[] infolog; } ogl_WarnIfError(); if (!ok) return false; Bind(nullptr); ogl_WarnIfError(); // Reorder sampler units to decrease redundant texture unit changes when // samplers bound in a different order. 
const std::unordered_map requiredUnits = { {CStrIntern("baseTex"), 0}, {CStrIntern("normTex"), 1}, {CStrIntern("specTex"), 2}, {CStrIntern("aoTex"), 3}, {CStrIntern("shadowTex"), 4}, {CStrIntern("losTex"), 5}, }; std::vector occupiedUnits; GLint numUniforms = 0; glGetProgramiv(m_Program, GL_ACTIVE_UNIFORMS, &numUniforms); ogl_WarnIfError(); for (GLint i = 0; i < numUniforms; ++i) { // TODO: use GL_ACTIVE_UNIFORM_MAX_LENGTH for the size. char name[256] = {0}; GLsizei nameLength = 0; GLint size = 0; GLenum type = 0; glGetActiveUniform(m_Program, i, ARRAY_SIZE(name), &nameLength, &size, &type, name); ogl_WarnIfError(); const GLint location = glGetUniformLocation(m_Program, name); // OpenGL specification is a bit vague about a name returned by glGetActiveUniform. // NVIDIA drivers return uniform name with "[0]", Intel Windows drivers without; while (nameLength > 3 && name[nameLength - 3] == '[' && name[nameLength - 2] == '0' && name[nameLength - 1] == ']') { nameLength -= 3; } name[nameLength] = 0; const CStrIntern nameIntern(name); m_BindingSlotsMapping[nameIntern] = m_BindingSlots.size(); BindingSlot bindingSlot{}; bindingSlot.name = nameIntern; bindingSlot.location = location; bindingSlot.size = size; bindingSlot.type = type; bindingSlot.isTexture = false; #define CASE(TYPE, ELEMENT_TYPE, ELEMENT_COUNT) \ case GL_ ## TYPE: \ bindingSlot.elementType = GL_ ## ELEMENT_TYPE; \ bindingSlot.elementCount = ELEMENT_COUNT; \ break; switch (type) { CASE(FLOAT, FLOAT, 1); CASE(FLOAT_VEC2, FLOAT, 2); CASE(FLOAT_VEC3, FLOAT, 3); CASE(FLOAT_VEC4, FLOAT, 4); CASE(INT, INT, 1); CASE(FLOAT_MAT2, FLOAT, 4); CASE(FLOAT_MAT3, FLOAT, 9); CASE(FLOAT_MAT4, FLOAT, 16); #if !CONFIG2_GLES // GL ES 2.0 doesn't support non-square matrices. CASE(FLOAT_MAT2x3, FLOAT, 6); CASE(FLOAT_MAT2x4, FLOAT, 8); CASE(FLOAT_MAT3x2, FLOAT, 6); CASE(FLOAT_MAT3x4, FLOAT, 12); CASE(FLOAT_MAT4x2, FLOAT, 8); CASE(FLOAT_MAT4x3, FLOAT, 12); #endif } #undef CASE // Assign sampler uniforms to sequential texture units. 
- if (type == GL_SAMPLER_2D - || type == GL_SAMPLER_CUBE + switch (type) + { + case GL_SAMPLER_2D: + bindingSlot.elementType = GL_TEXTURE_2D; + bindingSlot.isTexture = true; + break; + case GL_SAMPLER_CUBE: + bindingSlot.elementType = GL_TEXTURE_CUBE_MAP; + bindingSlot.isTexture = true; + break; #if !CONFIG2_GLES - || type == GL_SAMPLER_2D_SHADOW + case GL_SAMPLER_2D_SHADOW: + bindingSlot.elementType = GL_TEXTURE_2D; + bindingSlot.isTexture = true; + break; + case GL_IMAGE_2D: + bindingSlot.elementType = GL_IMAGE_2D; + bindingSlot.isTexture = true; + break; #endif - ) + default: + break; + } + + if (bindingSlot.isTexture) { const auto it = requiredUnits.find(nameIntern); const int unit = it == requiredUnits.end() ? -1 : it->second; - bindingSlot.elementType = (type == GL_SAMPLER_CUBE ? GL_TEXTURE_CUBE_MAP : GL_TEXTURE_2D); bindingSlot.elementCount = unit; - bindingSlot.isTexture = true; if (unit != -1) { if (unit >= static_cast(occupiedUnits.size())) occupiedUnits.resize(unit + 1); occupiedUnits[unit] = true; } } if (bindingSlot.elementType == 0) { LOGERROR("CShaderProgramGLSL::Link: unsupported uniform type: 0x%04x", static_cast(type)); } m_BindingSlots.emplace_back(std::move(bindingSlot)); } for (BindingSlot& bindingSlot : m_BindingSlots) { if (!bindingSlot.isTexture) continue; if (bindingSlot.elementCount == -1) { // We need to find a minimal available unit. int unit = 0; while (unit < static_cast(occupiedUnits.size()) && occupiedUnits[unit]) ++unit; if (unit >= static_cast(occupiedUnits.size())) occupiedUnits.resize(unit + 1); occupiedUnits[unit] = true; bindingSlot.elementCount = unit; } // Link uniform to unit. 
glUniform1i(bindingSlot.location, bindingSlot.elementCount); ogl_WarnIfError(); } // TODO: verify that we're not using more samplers than is supported Unbind(); return true; } void Bind(CShaderProgram* previousShaderProgram) override { CShaderProgramGLSL* previousShaderProgramGLSL = nullptr; if (previousShaderProgram) previousShaderProgramGLSL = static_cast(previousShaderProgram); ENSURE(this != previousShaderProgramGLSL); glUseProgram(m_Program); if (previousShaderProgramGLSL) { std::vector::iterator itPrevious = previousShaderProgramGLSL->m_ActiveVertexAttributes.begin(); std::vector::iterator itNext = m_ActiveVertexAttributes.begin(); while ( itPrevious != previousShaderProgramGLSL->m_ActiveVertexAttributes.end() || itNext != m_ActiveVertexAttributes.end()) { if (itPrevious != previousShaderProgramGLSL->m_ActiveVertexAttributes.end() && itNext != m_ActiveVertexAttributes.end()) { if (*itPrevious == *itNext) { ++itPrevious; ++itNext; } else if (*itPrevious < *itNext) { glDisableVertexAttribArray(*itPrevious); ++itPrevious; } else if (*itPrevious > *itNext) { glEnableVertexAttribArray(*itNext); ++itNext; } } else if (itPrevious != previousShaderProgramGLSL->m_ActiveVertexAttributes.end()) { glDisableVertexAttribArray(*itPrevious); ++itPrevious; } else if (itNext != m_ActiveVertexAttributes.end()) { glEnableVertexAttribArray(*itNext); ++itNext; } } } else { for (const int index : m_ActiveVertexAttributes) glEnableVertexAttribArray(index); } m_ValidStreams = 0; } void Unbind() override { glUseProgram(0); for (const int index : m_ActiveVertexAttributes) glDisableVertexAttribArray(index); } IDevice* GetDevice() override { return m_Device; } int32_t GetBindingSlot(const CStrIntern name) const override { auto it = m_BindingSlotsMapping.find(name); return it == m_BindingSlotsMapping.end() ? 
-1 : it->second; } TextureUnit GetTextureUnit(const int32_t bindingSlot) override { if (bindingSlot < 0 || bindingSlot >= static_cast(m_BindingSlots.size())) return { 0, 0, 0 }; TextureUnit textureUnit; textureUnit.type = m_BindingSlots[bindingSlot].type; textureUnit.target = m_BindingSlots[bindingSlot].elementType; textureUnit.unit = m_BindingSlots[bindingSlot].elementCount; return textureUnit; } void SetUniform( const int32_t bindingSlot, const float value) override { if (bindingSlot < 0 || bindingSlot >= static_cast(m_BindingSlots.size())) return; if (m_BindingSlots[bindingSlot].type != GL_FLOAT || m_BindingSlots[bindingSlot].size != 1) { LOGERROR("CShaderProgramGLSL::SetUniform(): Invalid uniform type (expected float) '%s'", m_BindingSlots[bindingSlot].name.c_str()); return; } glUniform1f(m_BindingSlots[bindingSlot].location, value); ogl_WarnIfError(); } void SetUniform( const int32_t bindingSlot, const float valueX, const float valueY) override { if (bindingSlot < 0 || bindingSlot >= static_cast(m_BindingSlots.size())) return; if (m_BindingSlots[bindingSlot].type != GL_FLOAT_VEC2 || m_BindingSlots[bindingSlot].size != 1) { LOGERROR("CShaderProgramGLSL::SetUniform(): Invalid uniform type (expected vec2) '%s'", m_BindingSlots[bindingSlot].name.c_str()); return; } glUniform2f(m_BindingSlots[bindingSlot].location, valueX, valueY); ogl_WarnIfError(); } void SetUniform( const int32_t bindingSlot, const float valueX, const float valueY, const float valueZ) override { if (bindingSlot < 0 || bindingSlot >= static_cast(m_BindingSlots.size())) return; if (m_BindingSlots[bindingSlot].type != GL_FLOAT_VEC3 || m_BindingSlots[bindingSlot].size != 1) { LOGERROR("CShaderProgramGLSL::SetUniform(): Invalid uniform type (expected vec3) '%s'", m_BindingSlots[bindingSlot].name.c_str()); return; } glUniform3f(m_BindingSlots[bindingSlot].location, valueX, valueY, valueZ); ogl_WarnIfError(); } void SetUniform( const int32_t bindingSlot, const float valueX, const float valueY, const 
float valueZ, const float valueW) override { if (bindingSlot < 0 || bindingSlot >= static_cast(m_BindingSlots.size())) return; if (m_BindingSlots[bindingSlot].type != GL_FLOAT_VEC4 || m_BindingSlots[bindingSlot].size != 1) { LOGERROR("CShaderProgramGLSL::SetUniform(): Invalid uniform type (expected vec4) '%s'", m_BindingSlots[bindingSlot].name.c_str()); return; } glUniform4f(m_BindingSlots[bindingSlot].location, valueX, valueY, valueZ, valueW); ogl_WarnIfError(); } void SetUniform( const int32_t bindingSlot, PS::span values) override { if (bindingSlot < 0 || bindingSlot >= static_cast(m_BindingSlots.size())) return; if (m_BindingSlots[bindingSlot].elementType != GL_FLOAT) { LOGERROR("CShaderProgramGLSL::SetUniform(): Invalid uniform element type (expected float) '%s'", m_BindingSlots[bindingSlot].name.c_str()); return; } if (m_BindingSlots[bindingSlot].size == 1 && m_BindingSlots[bindingSlot].elementCount > static_cast(values.size())) { LOGERROR( "CShaderProgramGLSL::SetUniform(): Invalid uniform element count (expected: %zu passed: %zu) '%s'", m_BindingSlots[bindingSlot].elementCount, values.size(), m_BindingSlots[bindingSlot].name.c_str()); return; } const GLint location = m_BindingSlots[bindingSlot].location; const GLenum type = m_BindingSlots[bindingSlot].type; if (type == GL_FLOAT) glUniform1fv(location, 1, values.data()); else if (type == GL_FLOAT_VEC2) glUniform2fv(location, 1, values.data()); else if (type == GL_FLOAT_VEC3) glUniform3fv(location, 1, values.data()); else if (type == GL_FLOAT_VEC4) glUniform4fv(location, 1, values.data()); else if (type == GL_FLOAT_MAT4) { // For case of array of matrices we might pass less number of matrices. 
const GLint size = std::min( m_BindingSlots[bindingSlot].size, static_cast(values.size() / 16)); glUniformMatrix4fv(location, size, GL_FALSE, values.data()); } else LOGERROR("CShaderProgramGLSL::SetUniform(): Invalid uniform type (expected float, vec2, vec3, vec4, mat4) '%s'", m_BindingSlots[bindingSlot].name.c_str()); ogl_WarnIfError(); } void VertexAttribPointer( const VertexAttributeStream stream, const Format format, const uint32_t offset, const uint32_t stride, const VertexAttributeRate rate, const void* data) override { const int attributeLocation = GetAttributeLocationFromStream(m_Device, stream); std::vector::const_iterator it = std::lower_bound(m_ActiveVertexAttributes.begin(), m_ActiveVertexAttributes.end(), attributeLocation); if (it == m_ActiveVertexAttributes.end() || *it != attributeLocation) return; const GLint size = GLSizeFromFormat(format); const GLenum type = GLTypeFromFormat(format); const GLboolean normalized = NormalizedFromFormat(format); glVertexAttribPointer( attributeLocation, size, type, normalized, stride, static_cast(data) + offset); #if CONFIG2_GLES ENSURE(!m_Device->GetCapabilities().instancing); UNUSED2(rate); #else if (rate == VertexAttributeRate::PER_INSTANCE) ENSURE(m_Device->GetCapabilities().instancing); if (m_Device->GetCapabilities().instancing) { glVertexAttribDivisorARB(attributeLocation, rate == VertexAttributeRate::PER_INSTANCE ? 1 : 0); } #endif m_ValidStreams |= GetStreamMask(stream); } std::vector GetFileDependencies() const override { return m_FileDependencies; } private: struct ShaderStage { GLenum type; GLuint shader; }; CDevice* m_Device = nullptr; CStr m_Name; std::vector m_FileDependencies; std::map m_VertexAttribs; // Sorted list of active vertex attributes. std::vector m_ActiveVertexAttributes; GLuint m_Program; // 5 = max(compute, vertex + tesselation (control + evaluation) + geometry + fragment). 
PS::StaticVector m_ShaderStages; struct BindingSlot { CStrIntern name; GLint location; GLint size; GLenum type; GLenum elementType; GLint elementCount; bool isTexture; }; std::vector m_BindingSlots; std::unordered_map m_BindingSlotsMapping; }; CShaderProgram::CShaderProgram(int streamflags) : m_StreamFlags(streamflags), m_ValidStreams(0) { } CShaderProgram::~CShaderProgram() = default; // static std::unique_ptr CShaderProgram::Create(CDevice* device, const CStr& name, const CShaderDefines& baseDefines) { PROFILE2("loading shader"); PROFILE2_ATTR("name: %s", name.c_str()); VfsPath xmlFilename = L"shaders/" + wstring_from_utf8(name) + L".xml"; CXeromyces XeroFile; PSRETURN ret = XeroFile.Load(g_VFS, xmlFilename); if (ret != PSRETURN_OK) return nullptr; #if USE_SHADER_XML_VALIDATION { // Serialize the XMB data and pass it to the validator XMLWriter_File shaderFile; shaderFile.SetPrettyPrint(false); shaderFile.XMB(XeroFile); bool ok = CXeromyces::ValidateEncoded("shader", name, shaderFile.GetOutput()); if (!ok) return nullptr; } #endif // Define all the elements and attributes used in the XML file #define EL(x) int el_##x = XeroFile.GetElementID(#x) #define AT(x) int at_##x = XeroFile.GetAttributeID(#x) + EL(compute); EL(define); EL(fragment); EL(stream); EL(uniform); EL(vertex); AT(attribute); AT(file); AT(if); AT(loc); AT(name); AT(type); AT(value); #undef AT #undef EL CPreprocessorWrapper preprocessor; preprocessor.AddDefines(baseDefines); XMBElement root = XeroFile.GetRoot(); const bool isGLSL = root.GetAttributes().GetNamedItem(at_type) == "glsl"; VfsPath vertexFile; VfsPath fragmentFile; CShaderDefines defines = baseDefines; std::map> vertexUniforms; std::map> fragmentUniforms; std::map vertexAttribs; int streamFlags = 0; + VfsPath computeFile; + XERO_ITER_EL(root, child) { if (child.GetNodeName() == el_define) { defines.Add(CStrIntern(child.GetAttributes().GetNamedItem(at_name)), CStrIntern(child.GetAttributes().GetNamedItem(at_value))); } else if 
(child.GetNodeName() == el_vertex) { vertexFile = L"shaders/" + child.GetAttributes().GetNamedItem(at_file).FromUTF8(); XERO_ITER_EL(child, param) { XMBAttributeList attributes = param.GetAttributes(); CStr cond = attributes.GetNamedItem(at_if); if (!cond.empty() && !preprocessor.TestConditional(cond)) continue; if (param.GetNodeName() == el_uniform) { vertexUniforms[CStrIntern(attributes.GetNamedItem(at_name))] = std::make_pair(attributes.GetNamedItem(at_type), attributes.GetNamedItem(at_loc).ToInt()); } else if (param.GetNodeName() == el_stream) { const CStr streamName = attributes.GetNamedItem(at_name); const CStr attributeName = attributes.GetNamedItem(at_attribute); if (attributeName.empty() && isGLSL) LOGERROR("Empty attribute name in vertex shader description '%s'", vertexFile.string8().c_str()); VertexAttributeStream stream = VertexAttributeStream::UV7; if (streamName == "pos") stream = VertexAttributeStream::POSITION; else if (streamName == "normal") stream = VertexAttributeStream::NORMAL; else if (streamName == "color") stream = VertexAttributeStream::COLOR; else if (streamName == "uv0") stream = VertexAttributeStream::UV0; else if (streamName == "uv1") stream = VertexAttributeStream::UV1; else if (streamName == "uv2") stream = VertexAttributeStream::UV2; else if (streamName == "uv3") stream = VertexAttributeStream::UV3; else if (streamName == "uv4") stream = VertexAttributeStream::UV4; else if (streamName == "uv5") stream = VertexAttributeStream::UV5; else if (streamName == "uv6") stream = VertexAttributeStream::UV6; else if (streamName == "uv7") stream = VertexAttributeStream::UV7; else LOGERROR("Unknown stream '%s' in vertex shader description '%s'", streamName.c_str(), vertexFile.string8().c_str()); if (isGLSL) { const int attributeLocation = GetAttributeLocationFromStream(device, stream); vertexAttribs[CStrIntern(attributeName)] = attributeLocation; } streamFlags |= GetStreamMask(stream); } } } else if (child.GetNodeName() == el_fragment) { 
fragmentFile = L"shaders/" + child.GetAttributes().GetNamedItem(at_file).FromUTF8(); XERO_ITER_EL(child, param) { XMBAttributeList attributes = param.GetAttributes(); CStr cond = attributes.GetNamedItem(at_if); if (!cond.empty() && !preprocessor.TestConditional(cond)) continue; if (param.GetNodeName() == el_uniform) { fragmentUniforms[CStrIntern(attributes.GetNamedItem(at_name))] = std::make_pair(attributes.GetNamedItem(at_type), attributes.GetNamedItem(at_loc).ToInt()); } } } + else if (child.GetNodeName() == el_compute) + { + computeFile = L"shaders/" + child.GetAttributes().GetNamedItem(at_file).FromUTF8(); + } } if (isGLSL) { - const std::array, 2> shaderStages{{ - {vertexFile, GL_VERTEX_SHADER}, {fragmentFile, GL_FRAGMENT_SHADER}}}; + if (!computeFile.empty()) + { + ENSURE(streamFlags == 0); + ENSURE(vertexAttribs.empty()); + } + const PS::StaticVector, 2> shaderStages{computeFile.empty() + ? PS::StaticVector, 2>{{vertexFile, GL_VERTEX_SHADER}, {fragmentFile, GL_FRAGMENT_SHADER}} + : PS::StaticVector, 2>{{computeFile, GL_COMPUTE_SHADER}}}; return std::make_unique( device, name, xmlFilename, shaderStages, defines, vertexAttribs, streamFlags); } else { #if CONFIG2_GLES LOGERROR("CShaderProgram::Create: '%s'+'%s': ARB shaders not supported on this device", vertexFile.string8(), fragmentFile.string8()); return nullptr; #else return std::make_unique( device, xmlFilename, vertexFile, fragmentFile, defines, vertexUniforms, fragmentUniforms, streamFlags); #endif } } // These should all be overridden by CShaderProgramGLSL, and not used // if a non-GLSL shader was loaded instead: #if CONFIG2_GLES // These should all be overridden by CShaderProgramGLSL // (GLES doesn't support any other types of shader program): void CShaderProgram::VertexPointer(const Renderer::Backend::Format UNUSED(format), GLsizei UNUSED(stride), const void* UNUSED(pointer)) { debug_warn("CShaderProgram::VertexPointer should be overridden"); } void CShaderProgram::NormalPointer(const 
Renderer::Backend::Format UNUSED(format), GLsizei UNUSED(stride), const void* UNUSED(pointer)) { debug_warn("CShaderProgram::NormalPointer should be overridden"); } void CShaderProgram::ColorPointer(const Renderer::Backend::Format UNUSED(format), GLsizei UNUSED(stride), const void* UNUSED(pointer)) { debug_warn("CShaderProgram::ColorPointer should be overridden"); } void CShaderProgram::TexCoordPointer(GLenum UNUSED(texture), const Renderer::Backend::Format UNUSED(format), GLsizei UNUSED(stride), const void* UNUSED(pointer)) { debug_warn("CShaderProgram::TexCoordPointer should be overridden"); } #else // These are overridden by CShaderProgramGLSL, but fixed-function and ARB shaders // both use the fixed-function vertex attribute pointers so we'll share their // definitions here: void CShaderProgram::VertexPointer(const Renderer::Backend::Format format, GLsizei stride, const void* pointer) { const GLint size = GLSizeFromFormat(format); ENSURE(2 <= size && size <= 4); const GLenum type = GLTypeFromFormat(format); glVertexPointer(size, type, stride, pointer); m_ValidStreams |= GetStreamMask(VertexAttributeStream::POSITION); } void CShaderProgram::NormalPointer(const Renderer::Backend::Format format, GLsizei stride, const void* pointer) { ENSURE(format == Renderer::Backend::Format::R32G32B32_SFLOAT); glNormalPointer(GL_FLOAT, stride, pointer); m_ValidStreams |= GetStreamMask(VertexAttributeStream::NORMAL); } void CShaderProgram::ColorPointer(const Renderer::Backend::Format format, GLsizei stride, const void* pointer) { const GLint size = GLSizeFromFormat(format); ENSURE(3 <= size && size <= 4); const GLenum type = GLTypeFromFormat(format); glColorPointer(size, type, stride, pointer); m_ValidStreams |= GetStreamMask(VertexAttributeStream::COLOR); } void CShaderProgram::TexCoordPointer(GLenum texture, const Renderer::Backend::Format format, GLsizei stride, const void* pointer) { glClientActiveTextureARB(texture); const GLint size = GLSizeFromFormat(format); ENSURE(1 <= 
size && size <= 4); const GLenum type = GLTypeFromFormat(format); glTexCoordPointer(size, type, stride, pointer); glClientActiveTextureARB(GL_TEXTURE0); m_ValidStreams |= GetStreamMask(VertexAttributeStream::UV0) << (texture - GL_TEXTURE0); } void CShaderProgram::BindClientStates() { ENSURE(m_StreamFlags == (m_StreamFlags & ( GetStreamMask(VertexAttributeStream::POSITION) | GetStreamMask(VertexAttributeStream::NORMAL) | GetStreamMask(VertexAttributeStream::COLOR) | GetStreamMask(VertexAttributeStream::UV0) | GetStreamMask(VertexAttributeStream::UV1)))); // Enable all the desired client states for non-GLSL rendering if (m_StreamFlags & GetStreamMask(VertexAttributeStream::POSITION)) glEnableClientState(GL_VERTEX_ARRAY); if (m_StreamFlags & GetStreamMask(VertexAttributeStream::NORMAL)) glEnableClientState(GL_NORMAL_ARRAY); if (m_StreamFlags & GetStreamMask(VertexAttributeStream::COLOR)) glEnableClientState(GL_COLOR_ARRAY); if (m_StreamFlags & GetStreamMask(VertexAttributeStream::UV0)) { glClientActiveTextureARB(GL_TEXTURE0); glEnableClientState(GL_TEXTURE_COORD_ARRAY); } if (m_StreamFlags & GetStreamMask(VertexAttributeStream::UV1)) { glClientActiveTextureARB(GL_TEXTURE1); glEnableClientState(GL_TEXTURE_COORD_ARRAY); glClientActiveTextureARB(GL_TEXTURE0); } // Rendering code must subsequently call VertexPointer etc for all of the streams // that were activated in this function, else AssertPointersBound will complain // that some arrays were unspecified m_ValidStreams = 0; } void CShaderProgram::UnbindClientStates() { if (m_StreamFlags & GetStreamMask(VertexAttributeStream::POSITION)) glDisableClientState(GL_VERTEX_ARRAY); if (m_StreamFlags & GetStreamMask(VertexAttributeStream::NORMAL)) glDisableClientState(GL_NORMAL_ARRAY); if (m_StreamFlags & GetStreamMask(VertexAttributeStream::COLOR)) glDisableClientState(GL_COLOR_ARRAY); if (m_StreamFlags & GetStreamMask(VertexAttributeStream::UV0)) { glClientActiveTextureARB(GL_TEXTURE0); 
glDisableClientState(GL_TEXTURE_COORD_ARRAY); } if (m_StreamFlags & GetStreamMask(VertexAttributeStream::UV1)) { glClientActiveTextureARB(GL_TEXTURE1); glDisableClientState(GL_TEXTURE_COORD_ARRAY); glClientActiveTextureARB(GL_TEXTURE0); } } #endif // !CONFIG2_GLES bool CShaderProgram::IsStreamActive(const VertexAttributeStream stream) const { return (m_StreamFlags & GetStreamMask(stream)) != 0; } void CShaderProgram::VertexAttribPointer( const VertexAttributeStream stream, const Format format, const uint32_t offset, const uint32_t stride, const VertexAttributeRate rate, const void* data) { ENSURE(rate == VertexAttributeRate::PER_VERTEX); switch (stream) { case VertexAttributeStream::POSITION: VertexPointer(format, stride, static_cast(data) + offset); break; case VertexAttributeStream::NORMAL: NormalPointer(format, stride, static_cast(data) + offset); break; case VertexAttributeStream::COLOR: ColorPointer(format, stride, static_cast(data) + offset); break; case VertexAttributeStream::UV0: FALLTHROUGH; case VertexAttributeStream::UV1: FALLTHROUGH; case VertexAttributeStream::UV2: FALLTHROUGH; case VertexAttributeStream::UV3: FALLTHROUGH; case VertexAttributeStream::UV4: FALLTHROUGH; case VertexAttributeStream::UV5: FALLTHROUGH; case VertexAttributeStream::UV6: FALLTHROUGH; case VertexAttributeStream::UV7: { const int indexOffset = static_cast(stream) - static_cast(VertexAttributeStream::UV0); TexCoordPointer(GL_TEXTURE0 + indexOffset, format, stride, static_cast(data) + offset); break; } default: debug_warn("Unsupported stream"); }; } void CShaderProgram::AssertPointersBound() { ENSURE((m_StreamFlags & ~m_ValidStreams) == 0); } } // namespace GL } // namespace Backend } // namespace Renderer Index: ps/trunk/source/renderer/backend/vulkan/DeviceCommandContext.cpp =================================================================== --- ps/trunk/source/renderer/backend/vulkan/DeviceCommandContext.cpp (revision 28009) +++ 
ps/trunk/source/renderer/backend/vulkan/DeviceCommandContext.cpp (revision 28010) @@ -1,1150 +1,1220 @@ -/* Copyright (C) 2023 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . */ #include "precompiled.h" #include "DeviceCommandContext.h" #include "lib/bits.h" #include "maths/MathUtil.h" #include "ps/CLogger.h" #include "ps/ConfigDB.h" #include "ps/containers/Span.h" #include "ps/containers/StaticVector.h" #include "renderer/backend/vulkan/Buffer.h" #include "renderer/backend/vulkan/DescriptorManager.h" #include "renderer/backend/vulkan/Device.h" #include "renderer/backend/vulkan/Framebuffer.h" #include "renderer/backend/vulkan/PipelineState.h" #include "renderer/backend/vulkan/RingCommandContext.h" #include "renderer/backend/vulkan/ShaderProgram.h" #include "renderer/backend/vulkan/Texture.h" #include "renderer/backend/vulkan/Utilities.h" #include #include #include namespace Renderer { namespace Backend { namespace Vulkan { namespace { constexpr uint32_t UNIFORM_BUFFER_INITIAL_SIZE = 1024 * 1024; constexpr uint32_t FRAME_INPLACE_BUFFER_INITIAL_SIZE = 128 * 1024; struct SBaseImageState { VkImageLayout layout = VK_IMAGE_LAYOUT_UNDEFINED; VkAccessFlags accessMask = 0; VkPipelineStageFlags stageMask = 0; }; SBaseImageState GetBaseImageState(CTexture* texture) { if (texture->GetUsage() & ITexture::Usage::SAMPLED) { return { 
VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_ACCESS_SHADER_READ_BIT, - VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT}; + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT}; } else if (texture->GetUsage() & ITexture::Usage::COLOR_ATTACHMENT) { return { VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT}; } else if (texture->GetUsage() & ITexture::Usage::DEPTH_STENCIL_ATTACHMENT) { return { VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL, VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT}; } // The only TRANSFER_DST usage means we can do only readbacks. else if (texture->GetUsage() == ITexture::Usage::TRANSFER_DST) { return { VK_IMAGE_LAYOUT_GENERAL, VK_ACCESS_HOST_READ_BIT, VK_PIPELINE_STAGE_HOST_BIT}; } return {}; } class ScopedImageLayoutTransition { public: ScopedImageLayoutTransition( CRingCommandContext& commandContext, const PS::span textures, const VkImageLayout layout, const VkAccessFlags accessMask, const VkPipelineStageFlags stageMask) : m_CommandContext(commandContext), m_Textures(textures), m_Layout(layout), m_AccessMask(accessMask), m_StageMask(stageMask) { for (CTexture* const texture : m_Textures) { const auto state = GetBaseImageState(texture); VkImageLayout oldLayout = state.layout; if (!texture->IsInitialized()) oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; Utilities::SetTextureLayout( m_CommandContext.GetCommandBuffer(), texture, oldLayout, m_Layout, state.accessMask, m_AccessMask, state.stageMask, m_StageMask); } } ~ScopedImageLayoutTransition() { for (CTexture* const texture : m_Textures) { const auto state = GetBaseImageState(texture); Utilities::SetTextureLayout( m_CommandContext.GetCommandBuffer(), texture, m_Layout, state.layout, m_AccessMask, state.accessMask, m_StageMask, state.stageMask); 
} } private: CRingCommandContext& m_CommandContext; const PS::span<CTexture* const> m_Textures; const VkImageLayout m_Layout = VK_IMAGE_LAYOUT_UNDEFINED; const VkAccessFlags m_AccessMask = 0; const VkPipelineStageFlags m_StageMask = 0; }; template<typename TransferOp> void TransferForEachFramebufferAttachmentPair( CRingCommandContext& commandContext, CFramebuffer* sourceFramebuffer, CFramebuffer* destinationFramebuffer, TransferOp transferOp) { const auto& sourceColorAttachments = sourceFramebuffer->GetColorAttachments(); const auto& destinationColorAttachments = destinationFramebuffer->GetColorAttachments(); ENSURE(sourceColorAttachments.size() == destinationColorAttachments.size()); for (CTexture* sourceColorAttachment : sourceColorAttachments) ENSURE(sourceColorAttachment->GetUsage() & ITexture::Usage::TRANSFER_SRC); for (CTexture* destinationColorAttachment : destinationColorAttachments) ENSURE(destinationColorAttachment->GetUsage() & ITexture::Usage::TRANSFER_DST); // TODO: combine barriers, reduce duplication, add depth. 
ScopedImageLayoutTransition scopedColorAttachmentsTransition{ commandContext, {sourceColorAttachments.begin(), sourceColorAttachments.end()}, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, VK_ACCESS_TRANSFER_READ_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT}; ScopedImageLayoutTransition destinationColorAttachmentsTransition{ commandContext, {destinationColorAttachments.begin(), destinationColorAttachments.end()}, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, VK_ACCESS_TRANSFER_WRITE_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT}; for (CFramebuffer::ColorAttachments::size_type index = 0; index < destinationColorAttachments.size(); ++index) { CTexture* sourceColorAttachment = sourceColorAttachments[index]; CTexture* destinationColorAttachment = destinationColorAttachments[index]; transferOp(commandContext.GetCommandBuffer(), sourceColorAttachment, destinationColorAttachment); } if (sourceFramebuffer->GetDepthStencilAttachment() && destinationFramebuffer->GetDepthStencilAttachment()) { transferOp( commandContext.GetCommandBuffer(), sourceFramebuffer->GetDepthStencilAttachment(), destinationFramebuffer->GetDepthStencilAttachment()); } } } // anonymous namespace // A helper class to store consequent uploads to avoid many copy functions. // We use a buffer in the device memory and NUMBER_OF_FRAMES_IN_FLIGHT // times bigger buffer in the host memory. 
class CDeviceCommandContext::CUploadRing { public: CUploadRing( CDevice* device, const IBuffer::Type type, const uint32_t initialCapacity); CBuffer* GetBuffer() { return m_Buffer.get(); } uint32_t ScheduleUpload( VkCommandBuffer commandBuffer, const PS::span<const std::byte> data, const uint32_t alignment); void ExecuteUploads(VkCommandBuffer commandBuffer); private: void ResizeIfNeeded( VkCommandBuffer commandBuffer, const uint32_t dataSize); CDevice* m_Device = nullptr; IBuffer::Type m_Type = IBuffer::Type::VERTEX; uint32_t m_Capacity = 0; uint32_t m_BlockIndex = 0, m_BlockOffset = 0; std::unique_ptr<CBuffer> m_Buffer, m_StagingBuffer; std::byte* m_StagingBufferMappedData = nullptr; }; CDeviceCommandContext::CUploadRing::CUploadRing( CDevice* device, const IBuffer::Type type, const uint32_t initialCapacity) : m_Device(device), m_Type(type) { ResizeIfNeeded(VK_NULL_HANDLE, initialCapacity); } void CDeviceCommandContext::CUploadRing::ResizeIfNeeded( VkCommandBuffer commandBuffer, const uint32_t dataSize) { const bool resizeNeeded = !m_Buffer || m_BlockOffset + dataSize > m_Capacity; if (!resizeNeeded) return; if (m_Buffer && m_BlockOffset > 0) { ENSURE(commandBuffer != VK_NULL_HANDLE); ExecuteUploads(commandBuffer); } m_Capacity = std::max(m_Capacity * 2, round_up_to_pow2(dataSize)); m_Buffer = m_Device->CreateCBuffer( "UploadRingBuffer", m_Type, m_Capacity, true); m_StagingBuffer = m_Device->CreateCBuffer( "UploadRingStagingBuffer", IBuffer::Type::UPLOAD, NUMBER_OF_FRAMES_IN_FLIGHT * m_Capacity, true); ENSURE(m_Buffer && m_StagingBuffer); m_StagingBufferMappedData = static_cast<std::byte*>(m_StagingBuffer->GetMappedData()); ENSURE(m_StagingBufferMappedData); m_BlockIndex = 0; m_BlockOffset = 0; } uint32_t CDeviceCommandContext::CUploadRing::ScheduleUpload( VkCommandBuffer commandBuffer, const PS::span<const std::byte> data, const uint32_t alignment) { ENSURE(data.size() > 0); ENSURE(is_pow2(alignment)); m_BlockOffset = (m_BlockOffset + alignment - 1) & ~(alignment - 1); ResizeIfNeeded(commandBuffer, data.size()); 
const uint32_t destination = m_BlockIndex * m_Capacity + m_BlockOffset; const uint32_t offset = m_BlockOffset; m_BlockOffset += data.size(); std::memcpy(m_StagingBufferMappedData + destination, data.data(), data.size()); return offset; } void CDeviceCommandContext::CUploadRing::ExecuteUploads( VkCommandBuffer commandBuffer) { if (m_BlockOffset == 0) return; const VkPipelineStageFlags stageMask = m_Type == IBuffer::Type::UNIFORM - ? VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT + ? VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT : VK_PIPELINE_STAGE_VERTEX_INPUT_BIT; Utilities::SubmitBufferMemoryBarrier( commandBuffer, m_Buffer.get(), 0, VK_WHOLE_SIZE, VK_ACCESS_MEMORY_READ_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, stageMask, VK_PIPELINE_STAGE_TRANSFER_BIT); VkBufferCopy region{}; region.srcOffset = m_BlockIndex * m_Capacity; region.dstOffset = 0; region.size = m_BlockOffset; vkCmdCopyBuffer( commandBuffer, m_StagingBuffer->GetVkBuffer(), m_Buffer->GetVkBuffer(), 1, ®ion); Utilities::SubmitBufferMemoryBarrier( commandBuffer, m_Buffer.get(), 0, VK_WHOLE_SIZE, VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_MEMORY_READ_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, stageMask); m_BlockIndex = (m_BlockIndex + 1) % NUMBER_OF_FRAMES_IN_FLIGHT; m_BlockOffset = 0; } // static std::unique_ptr CDeviceCommandContext::Create(CDevice* device) { std::unique_ptr deviceCommandContext(new CDeviceCommandContext()); deviceCommandContext->m_Device = device; deviceCommandContext->m_DebugScopedLabels = device->GetCapabilities().debugScopedLabels; deviceCommandContext->m_PrependCommandContext = device->CreateRingCommandContext(NUMBER_OF_FRAMES_IN_FLIGHT); deviceCommandContext->m_CommandContext = device->CreateRingCommandContext(NUMBER_OF_FRAMES_IN_FLIGHT); deviceCommandContext->m_VertexUploadRing = std::make_unique( device, IBuffer::Type::VERTEX, FRAME_INPLACE_BUFFER_INITIAL_SIZE); 
deviceCommandContext->m_IndexUploadRing = std::make_unique( device, IBuffer::Type::INDEX, FRAME_INPLACE_BUFFER_INITIAL_SIZE); deviceCommandContext->m_UniformUploadRing = std::make_unique( device, IBuffer::Type::UNIFORM, UNIFORM_BUFFER_INITIAL_SIZE); // TODO: reduce the code duplication. VkDescriptorPoolSize descriptorPoolSize{}; descriptorPoolSize.type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC; descriptorPoolSize.descriptorCount = 1; VkDescriptorPoolCreateInfo descriptorPoolCreateInfo{}; descriptorPoolCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; descriptorPoolCreateInfo.poolSizeCount = 1; descriptorPoolCreateInfo.pPoolSizes = &descriptorPoolSize; descriptorPoolCreateInfo.maxSets = 1; ENSURE_VK_SUCCESS(vkCreateDescriptorPool( device->GetVkDevice(), &descriptorPoolCreateInfo, nullptr, &deviceCommandContext->m_UniformDescriptorPool)); VkDescriptorSetAllocateInfo descriptorSetAllocateInfo{}; descriptorSetAllocateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; descriptorSetAllocateInfo.descriptorPool = deviceCommandContext->m_UniformDescriptorPool; descriptorSetAllocateInfo.descriptorSetCount = 1; descriptorSetAllocateInfo.pSetLayouts = &device->GetDescriptorManager().GetUniformDescriptorSetLayout(); ENSURE_VK_SUCCESS(vkAllocateDescriptorSets( device->GetVkDevice(), &descriptorSetAllocateInfo, &deviceCommandContext->m_UniformDescriptorSet)); CBuffer* uniformBuffer = deviceCommandContext->m_UniformUploadRing->GetBuffer(); ENSURE(uniformBuffer); // TODO: fix the hard-coded size. 
const VkDescriptorBufferInfo descriptorBufferInfos[1] = { {uniformBuffer->GetVkBuffer(), 0u, 512u} }; VkWriteDescriptorSet writeDescriptorSet{}; writeDescriptorSet.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; writeDescriptorSet.dstSet = deviceCommandContext->m_UniformDescriptorSet; writeDescriptorSet.dstBinding = 0; writeDescriptorSet.dstArrayElement = 0; writeDescriptorSet.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC; writeDescriptorSet.descriptorCount = 1; writeDescriptorSet.pBufferInfo = descriptorBufferInfos; vkUpdateDescriptorSets( device->GetVkDevice(), 1, &writeDescriptorSet, 0, nullptr); CFG_GET_VAL( "renderer.backend.vulkan.debugbarrierafterframebufferpass", deviceCommandContext->m_DebugBarrierAfterFramebufferPass); return deviceCommandContext; } CDeviceCommandContext::CDeviceCommandContext() = default; CDeviceCommandContext::~CDeviceCommandContext() { VkDevice device = m_Device->GetVkDevice(); vkDeviceWaitIdle(device); if (m_UniformDescriptorPool != VK_NULL_HANDLE) vkDestroyDescriptorPool(device, m_UniformDescriptorPool, nullptr); } IDevice* CDeviceCommandContext::GetDevice() { return m_Device; } void CDeviceCommandContext::SetGraphicsPipelineState( IGraphicsPipelineState* pipelineState) { ENSURE(pipelineState); m_GraphicsPipelineState = pipelineState->As(); CShaderProgram* shaderProgram = m_GraphicsPipelineState->GetShaderProgram()->As(); if (m_ShaderProgram != shaderProgram) { if (m_ShaderProgram) m_ShaderProgram->Unbind(); m_ShaderProgram = shaderProgram; } m_IsPipelineStateDirty = true; } +void CDeviceCommandContext::SetComputePipelineState( + IComputePipelineState* pipelineState) +{ + if (m_ShaderProgram) + m_ShaderProgram->Unbind(); + + ENSURE(pipelineState); + CComputePipelineState* computePipelineState = pipelineState->As(); + m_ShaderProgram = computePipelineState->GetShaderProgram()->As(); + m_ShaderProgram->Bind(); + vkCmdBindPipeline( + m_CommandContext->GetCommandBuffer(), m_ShaderProgram->GetPipelineBindPoint(), 
computePipelineState->GetPipeline()); + + if (m_Device->GetDescriptorManager().UseDescriptorIndexing()) + { + vkCmdBindDescriptorSets( + m_CommandContext->GetCommandBuffer(), m_ShaderProgram->GetPipelineBindPoint(), + m_ShaderProgram->GetPipelineLayout(), 0, + 1, &m_Device->GetDescriptorManager().GetDescriptorIndexingSet(), 0, nullptr); + } +} + void CDeviceCommandContext::BlitFramebuffer( IFramebuffer* sourceFramebuffer, IFramebuffer* destinationFramebuffer, const Rect& sourceRegion, const Rect& destinationRegion, const Sampler::Filter filter) { ENSURE(!m_InsideFramebufferPass); ENSURE(sourceRegion.x >= 0 && sourceRegion.x + sourceRegion.width <= static_cast(sourceFramebuffer->GetWidth())); ENSURE(sourceRegion.y >= 0 && sourceRegion.y + sourceRegion.height <= static_cast(sourceFramebuffer->GetHeight())); ENSURE(destinationRegion.x >= 0 && destinationRegion.x + destinationRegion.width <= static_cast(destinationFramebuffer->GetWidth())); ENSURE(destinationRegion.y >= 0 && destinationRegion.y + destinationRegion.height <= static_cast(destinationFramebuffer->GetHeight())); TransferForEachFramebufferAttachmentPair( *m_CommandContext, sourceFramebuffer->As(), destinationFramebuffer->As(), [&sourceRegion, &destinationRegion, filter]( VkCommandBuffer commandBuffer, CTexture* sourceColorAttachment, CTexture* destinationColorAttachment) { // TODO: we need to check for VK_FORMAT_FEATURE_BLIT_*_BIT for used formats. const bool isDepth = IsDepthFormat(sourceColorAttachment->GetFormat()); const VkImageAspectFlags aspectMask = isDepth ? VK_IMAGE_ASPECT_DEPTH_BIT : VK_IMAGE_ASPECT_COLOR_BIT; VkImageBlit region{}; // Currently (0, 0) is the left-bottom corner (legacy from GL), so // we need to adjust the regions. 
const uint32_t sourceHeight = sourceColorAttachment->GetHeight(); const uint32_t destinationHeight = destinationColorAttachment->GetHeight(); region.srcOffsets[0].x = sourceRegion.x; region.srcOffsets[0].y = sourceHeight - sourceRegion.y - sourceRegion.height; region.dstOffsets[0].x = destinationRegion.x; region.dstOffsets[0].y = destinationHeight - destinationRegion.y - destinationRegion.height; region.srcOffsets[1].x = sourceRegion.x + sourceRegion.width; region.srcOffsets[1].y = sourceHeight - sourceRegion.y; region.srcOffsets[1].z = 1; region.dstOffsets[1].x = destinationRegion.x + destinationRegion.width; region.dstOffsets[1].y = destinationHeight - destinationRegion.y; region.dstOffsets[1].z = 1; region.srcSubresource.aspectMask = aspectMask; region.srcSubresource.mipLevel = 0; region.srcSubresource.baseArrayLayer = 0; region.srcSubresource.layerCount = 1; region.dstSubresource.aspectMask = aspectMask; region.dstSubresource.mipLevel = 0; region.dstSubresource.baseArrayLayer = 0; region.dstSubresource.layerCount = 1; vkCmdBlitImage( commandBuffer, sourceColorAttachment->GetImage(), VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, destinationColorAttachment->GetImage(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, ®ion, filter == Sampler::Filter::LINEAR ? 
VK_FILTER_LINEAR : VK_FILTER_NEAREST); }); } void CDeviceCommandContext::ResolveFramebuffer( IFramebuffer* sourceFramebuffer, IFramebuffer* destinationFramebuffer) { ENSURE(!m_InsideFramebufferPass); ENSURE(sourceFramebuffer->As()->GetSampleCount() > 1); ENSURE(destinationFramebuffer->As()->GetSampleCount() == 1); ENSURE(sourceFramebuffer->As()->GetWidth() == destinationFramebuffer->As()->GetWidth()); ENSURE(sourceFramebuffer->As()->GetHeight() == destinationFramebuffer->As()->GetHeight()); TransferForEachFramebufferAttachmentPair( *m_CommandContext, sourceFramebuffer->As(), destinationFramebuffer->As(), [](VkCommandBuffer commandBuffer, CTexture* sourceColorAttachment, CTexture* destinationColorAttachment) { ENSURE(sourceColorAttachment->GetFormat() == destinationColorAttachment->GetFormat()); ENSURE(!IsDepthFormat(sourceColorAttachment->GetFormat())); const VkImageAspectFlags aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; VkImageResolve region{}; region.extent.width = sourceColorAttachment->GetWidth(); region.extent.height = sourceColorAttachment->GetHeight(); region.extent.depth = 1; region.srcSubresource.aspectMask = aspectMask; region.srcSubresource.mipLevel = 0; region.srcSubresource.baseArrayLayer = 0; region.srcSubresource.layerCount = 1; region.dstSubresource.aspectMask = aspectMask; region.dstSubresource.mipLevel = 0; region.dstSubresource.baseArrayLayer = 0; region.dstSubresource.layerCount = 1; vkCmdResolveImage( commandBuffer, sourceColorAttachment->GetImage(), VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, destinationColorAttachment->GetImage(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, ®ion); }); } void CDeviceCommandContext::ClearFramebuffer(const bool color, const bool depth, const bool stencil) { ENSURE(m_InsideFramebufferPass); ENSURE(m_Framebuffer); PS::StaticVector clearAttachments; if (color) { ENSURE(!m_Framebuffer->GetColorAttachments().empty()); for (size_t index = 0; index < m_Framebuffer->GetColorAttachments().size(); ++index) { VkClearAttachment 
clearAttachment{}; clearAttachment.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; const CColor& clearColor = m_Framebuffer->GetClearColor(); clearAttachment.clearValue.color.float32[0] = clearColor.r; clearAttachment.clearValue.color.float32[1] = clearColor.g; clearAttachment.clearValue.color.float32[2] = clearColor.b; clearAttachment.clearValue.color.float32[3] = clearColor.a; clearAttachment.colorAttachment = index; clearAttachments.emplace_back(std::move(clearAttachment)); } } if (depth || stencil) { ENSURE(m_Framebuffer->GetDepthStencilAttachment()); if (stencil) { const Format depthStencilFormat = m_Framebuffer->GetDepthStencilAttachment()->GetFormat(); ENSURE(depthStencilFormat == Format::D24_UNORM_S8_UINT || depthStencilFormat == Format::D32_SFLOAT_S8_UINT); } VkClearAttachment clearAttachment{}; if (depth) clearAttachment.aspectMask |= VK_IMAGE_ASPECT_DEPTH_BIT; if (stencil) clearAttachment.aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT; clearAttachment.clearValue.depthStencil.depth = 1.0f; clearAttachment.clearValue.depthStencil.stencil = 0; clearAttachments.emplace_back(std::move(clearAttachment)); } VkClearRect clearRect{}; clearRect.layerCount = 1; clearRect.rect.offset.x = 0; clearRect.rect.offset.y = 0; clearRect.rect.extent.width = m_Framebuffer->GetWidth(); clearRect.rect.extent.height = m_Framebuffer->GetHeight(); vkCmdClearAttachments( m_CommandContext->GetCommandBuffer(), clearAttachments.size(), clearAttachments.data(), 1, &clearRect); } void CDeviceCommandContext::BeginFramebufferPass(IFramebuffer* framebuffer) { + ENSURE(!m_InsideFramebufferPass); + ENSURE(!m_InsideComputePass); ENSURE(framebuffer); m_IsPipelineStateDirty = true; m_Framebuffer = framebuffer->As(); m_GraphicsPipelineState = nullptr; m_VertexInputLayout = nullptr; SetScissors(0, nullptr); for (CTexture* colorAttachment : m_Framebuffer->GetColorAttachments()) { if (!(colorAttachment->GetUsage() & ITexture::Usage::SAMPLED) && colorAttachment->IsInitialized()) continue; VkImageLayout oldLayout 
= VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; if (!colorAttachment->IsInitialized()) oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; Utilities::SetTextureLayout( m_CommandContext->GetCommandBuffer(), colorAttachment, oldLayout, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, VK_ACCESS_SHADER_READ_BIT, VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, - VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT); + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT); } CTexture* depthStencilAttachment = m_Framebuffer->GetDepthStencilAttachment(); if (depthStencilAttachment && ((depthStencilAttachment->GetUsage() & ITexture::Usage::SAMPLED) || !depthStencilAttachment->IsInitialized())) { VkImageLayout oldLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; if (!depthStencilAttachment->IsInitialized()) oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; Utilities::SetTextureLayout( m_CommandContext->GetCommandBuffer(), depthStencilAttachment, oldLayout, VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL, VK_ACCESS_SHADER_READ_BIT, VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, - VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT); } m_InsideFramebufferPass = true; VkRenderPassBeginInfo renderPassBeginInfo{}; renderPassBeginInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO; renderPassBeginInfo.renderPass = m_Framebuffer->GetRenderPass(); renderPassBeginInfo.framebuffer = m_Framebuffer->GetFramebuffer(); renderPassBeginInfo.renderArea.offset = { 0, 0 }; renderPassBeginInfo.renderArea.extent = { m_Framebuffer->GetWidth(), m_Framebuffer->GetHeight() }; PS::StaticVector clearValues; const bool needsClearValues = 
m_Framebuffer->GetColorAttachmentLoadOp() == AttachmentLoadOp::CLEAR || (m_Framebuffer->GetDepthStencilAttachment() && m_Framebuffer->GetDepthStencilAttachmentLoadOp() == AttachmentLoadOp::CLEAR); if (needsClearValues) { for (CTexture* colorAttachment : m_Framebuffer->GetColorAttachments()) { UNUSED2(colorAttachment); const CColor& clearColor = m_Framebuffer->GetClearColor(); // The four array elements of the clear color map to R, G, B, and A // components of image formats, in order. clearValues.emplace_back(); clearValues.back().color.float32[0] = clearColor.r; clearValues.back().color.float32[1] = clearColor.g; clearValues.back().color.float32[2] = clearColor.b; clearValues.back().color.float32[3] = clearColor.a; } if (m_Framebuffer->GetDepthStencilAttachment()) { clearValues.emplace_back(); clearValues.back().depthStencil.depth = 1.0f; clearValues.back().depthStencil.stencil = 0; } renderPassBeginInfo.clearValueCount = clearValues.size(); renderPassBeginInfo.pClearValues = clearValues.data(); } vkCmdBeginRenderPass(m_CommandContext->GetCommandBuffer(), &renderPassBeginInfo, VK_SUBPASS_CONTENTS_INLINE); } void CDeviceCommandContext::EndFramebufferPass() { ENSURE(m_InsideFramebufferPass); vkCmdEndRenderPass(m_CommandContext->GetCommandBuffer()); m_InsideFramebufferPass = false; m_BoundIndexBuffer = nullptr; ENSURE(m_Framebuffer); for (CTexture* colorAttachment : m_Framebuffer->GetColorAttachments()) { if (!(colorAttachment->GetUsage() & ITexture::Usage::SAMPLED)) continue; Utilities::SetTextureLayout( m_CommandContext->GetCommandBuffer(), colorAttachment, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, - VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT); + VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT); } CTexture* 
depthStencilAttachment = m_Framebuffer->GetDepthStencilAttachment(); if (depthStencilAttachment && (depthStencilAttachment->GetUsage() & ITexture::Usage::SAMPLED)) { Utilities::SetTextureLayout( m_CommandContext->GetCommandBuffer(), depthStencilAttachment, VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT, - VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT); + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT); } m_LastBoundPipeline = VK_NULL_HANDLE; if (m_ShaderProgram) m_ShaderProgram->Unbind(); m_ShaderProgram = nullptr; if (m_DebugBarrierAfterFramebufferPass) Utilities::SubmitDebugSyncMemoryBarrier(m_CommandContext->GetCommandBuffer()); } void CDeviceCommandContext::ReadbackFramebufferSync( const uint32_t x, const uint32_t y, const uint32_t width, const uint32_t height, void* data) { CTexture* texture = m_Device->GetCurrentBackbufferTexture(); if (!texture) { LOGERROR("Vulkan: backbuffer is unavailable."); return; } if (!(texture->GetUsage() & ITexture::Usage::TRANSFER_SRC)) { LOGERROR("Vulkan: backbuffer doesn't support readback."); return; } m_QueuedReadbacks.emplace_back(x, y, width, height, data); } void CDeviceCommandContext::UploadTexture(ITexture* texture, const Format dataFormat, const void* data, const size_t dataSize, const uint32_t level, const uint32_t layer) { (m_InsideFramebufferPass ? 
m_PrependCommandContext : m_CommandContext)->ScheduleUpload( texture->As(), dataFormat, data, dataSize, level, layer); } void CDeviceCommandContext::UploadTextureRegion(ITexture* texture, const Format dataFormat, const void* data, const size_t dataSize, const uint32_t xOffset, const uint32_t yOffset, const uint32_t width, const uint32_t height, const uint32_t level, const uint32_t layer) { (m_InsideFramebufferPass ? m_PrependCommandContext : m_CommandContext)->ScheduleUpload( texture->As(), dataFormat, data, dataSize, xOffset, yOffset, width, height, level, layer); } void CDeviceCommandContext::UploadBuffer(IBuffer* buffer, const void* data, const uint32_t dataSize) { ENSURE(!m_InsideFramebufferPass); m_CommandContext->ScheduleUpload( buffer->As(), data, 0, dataSize); } void CDeviceCommandContext::UploadBuffer(IBuffer* buffer, const UploadBufferFunction& uploadFunction) { ENSURE(!m_InsideFramebufferPass); m_CommandContext->ScheduleUpload( buffer->As(), 0, buffer->As()->GetSize(), uploadFunction); } void CDeviceCommandContext::UploadBufferRegion( IBuffer* buffer, const void* data, const uint32_t dataOffset, const uint32_t dataSize) { ENSURE(!m_InsideFramebufferPass); m_CommandContext->ScheduleUpload( buffer->As(), data, dataOffset, dataSize); } void CDeviceCommandContext::UploadBufferRegion( IBuffer* buffer, const uint32_t dataOffset, const uint32_t dataSize, const UploadBufferFunction& uploadFunction) { m_CommandContext->ScheduleUpload( buffer->As(), dataOffset, dataSize, uploadFunction); } void CDeviceCommandContext::SetScissors(const uint32_t scissorCount, const Rect* scissors) { ENSURE(m_Framebuffer); ENSURE(scissorCount <= 1); VkRect2D scissor{}; if (scissorCount == 1) { // the x and y members of offset member of any element of pScissors must be // greater than or equal to 0. 
int32_t x = scissors[0].x; int32_t y = m_Framebuffer->GetHeight() - scissors[0].y - scissors[0].height; int32_t width = scissors[0].width; int32_t height = scissors[0].height; if (x < 0) { width = std::max(0, width + x); x = 0; } if (y < 0) { height = std::max(0, height + y); y = 0; } scissor.offset.x = x; scissor.offset.y = y; scissor.extent.width = width; scissor.extent.height = height; } else { scissor.extent.width = m_Framebuffer->GetWidth(); scissor.extent.height = m_Framebuffer->GetHeight(); } vkCmdSetScissor(m_CommandContext->GetCommandBuffer(), 0, 1, &scissor); } void CDeviceCommandContext::SetViewports(const uint32_t viewportCount, const Rect* viewports) { ENSURE(m_Framebuffer); ENSURE(viewportCount == 1); VkViewport viewport{}; viewport.minDepth = 0.0f; viewport.maxDepth = 1.0f; viewport.x = static_cast(viewports[0].x); viewport.y = static_cast(static_cast(m_Framebuffer->GetHeight()) - viewports[0].y - viewports[0].height); viewport.width = static_cast(viewports[0].width); viewport.height = static_cast(viewports[0].height); vkCmdSetViewport(m_CommandContext->GetCommandBuffer(), 0, 1, &viewport); } void CDeviceCommandContext::SetVertexInputLayout( IVertexInputLayout* vertexInputLayout) { ENSURE(vertexInputLayout); m_IsPipelineStateDirty = true; m_VertexInputLayout = vertexInputLayout->As(); } void CDeviceCommandContext::SetVertexBuffer( const uint32_t bindingSlot, IBuffer* buffer, const uint32_t offset) { BindVertexBuffer(bindingSlot, buffer->As(), offset); } void CDeviceCommandContext::SetVertexBufferData( const uint32_t bindingSlot, const void* data, const uint32_t dataSize) { // TODO: check vertex buffer alignment. 
const uint32_t ALIGNMENT = 32; const uint32_t offset = m_VertexUploadRing->ScheduleUpload( m_PrependCommandContext->GetCommandBuffer(), PS::span{static_cast(data), dataSize}, ALIGNMENT); BindVertexBuffer(bindingSlot, m_VertexUploadRing->GetBuffer(), offset); } void CDeviceCommandContext::SetIndexBuffer(IBuffer* buffer) { BindIndexBuffer(buffer->As(), 0); } void CDeviceCommandContext::SetIndexBufferData( const void* data, const uint32_t dataSize) { // TODO: check index buffer alignment. const uint32_t ALIGNMENT = 32; const uint32_t offset = m_IndexUploadRing->ScheduleUpload( m_PrependCommandContext->GetCommandBuffer(), PS::span{static_cast(data), dataSize}, ALIGNMENT); BindIndexBuffer(m_IndexUploadRing->GetBuffer(), offset); } void CDeviceCommandContext::BeginPass() { ENSURE(m_InsideFramebufferPass); m_InsidePass = true; } void CDeviceCommandContext::EndPass() { ENSURE(m_InsidePass); m_InsidePass = false; } void CDeviceCommandContext::Draw(const uint32_t firstVertex, const uint32_t vertexCount) { PreDraw(); vkCmdDraw(m_CommandContext->GetCommandBuffer(), vertexCount, 1, firstVertex, 0); } void CDeviceCommandContext::DrawIndexed( const uint32_t firstIndex, const uint32_t indexCount, const int32_t vertexOffset) { ENSURE(vertexOffset == 0); PreDraw(); vkCmdDrawIndexed(m_CommandContext->GetCommandBuffer(), indexCount, 1, firstIndex, 0, 0); } void CDeviceCommandContext::DrawInstanced( const uint32_t firstVertex, const uint32_t vertexCount, const uint32_t firstInstance, const uint32_t instanceCount) { PreDraw(); vkCmdDraw( m_CommandContext->GetCommandBuffer(), vertexCount, instanceCount, firstVertex, firstInstance); } void CDeviceCommandContext::DrawIndexedInstanced( const uint32_t firstIndex, const uint32_t indexCount, const uint32_t firstInstance, const uint32_t instanceCount, const int32_t vertexOffset) { PreDraw(); vkCmdDrawIndexed( m_CommandContext->GetCommandBuffer(), indexCount, instanceCount, firstIndex, vertexOffset, firstInstance); } void 
CDeviceCommandContext::DrawIndexedInRange( const uint32_t firstIndex, const uint32_t indexCount, const uint32_t UNUSED(start), const uint32_t UNUSED(end)) { DrawIndexed(firstIndex, indexCount, 0); } +void CDeviceCommandContext::BeginComputePass() +{ + ENSURE(!m_InsideFramebufferPass); + ENSURE(!m_InsideComputePass); + m_InsideComputePass = true; +} + +void CDeviceCommandContext::EndComputePass() +{ + if (m_ShaderProgram) + { + m_ShaderProgram->Unbind(); + m_ShaderProgram = nullptr; + } + + ENSURE(m_InsideComputePass); + m_InsideComputePass = false; +} + +void CDeviceCommandContext::Dispatch( + const uint32_t groupCountX, + const uint32_t groupCountY, + const uint32_t groupCountZ) +{ + ENSURE(m_InsideComputePass); + m_ShaderProgram->PreDispatch(*m_CommandContext); + UpdateOutdatedConstants(); + vkCmdDispatch( + m_CommandContext->GetCommandBuffer(), groupCountX, groupCountY, groupCountZ); + m_ShaderProgram->PostDispatch(*m_CommandContext); +} + void CDeviceCommandContext::SetTexture(const int32_t bindingSlot, ITexture* texture) { if (bindingSlot < 0) return; - ENSURE(m_InsidePass); + ENSURE(m_InsidePass || m_InsideComputePass); ENSURE(texture); CTexture* textureToBind = texture->As(); ENSURE(textureToBind->GetUsage() & ITexture::Usage::SAMPLED); - if (!m_Device->GetDescriptorManager().UseDescriptorIndexing()) + if (!m_Device->GetDescriptorManager().UseDescriptorIndexing() && m_InsidePass) { // We can't bind textures which are used as attachments. 
const auto& colorAttachments = m_Framebuffer->GetColorAttachments(); ENSURE(std::find( colorAttachments.begin(), colorAttachments.end(), textureToBind) == colorAttachments.end()); ENSURE(m_Framebuffer->GetDepthStencilAttachment() != textureToBind); ENSURE(textureToBind->IsInitialized()); } m_ShaderProgram->SetTexture(bindingSlot, textureToBind); } +void CDeviceCommandContext::SetStorageTexture(const int32_t bindingSlot, ITexture* texture) +{ + ENSURE(m_InsidePass || m_InsideComputePass); + ENSURE(texture); + CTexture* textureToBind = texture->As(); + ENSURE(textureToBind->GetUsage() & ITexture::Usage::STORAGE); + m_ShaderProgram->SetStorageTexture(bindingSlot, textureToBind); +} + void CDeviceCommandContext::SetUniform( const int32_t bindingSlot, const float value) { - ENSURE(m_InsidePass); + ENSURE(m_InsidePass || m_InsideComputePass); m_ShaderProgram->SetUniform(bindingSlot, value); } void CDeviceCommandContext::SetUniform( const int32_t bindingSlot, const float valueX, const float valueY) { - ENSURE(m_InsidePass); + ENSURE(m_InsidePass || m_InsideComputePass); m_ShaderProgram->SetUniform(bindingSlot, valueX, valueY); } void CDeviceCommandContext::SetUniform( const int32_t bindingSlot, const float valueX, const float valueY, const float valueZ) { - ENSURE(m_InsidePass); + ENSURE(m_InsidePass || m_InsideComputePass); m_ShaderProgram->SetUniform(bindingSlot, valueX, valueY, valueZ); } void CDeviceCommandContext::SetUniform( const int32_t bindingSlot, const float valueX, const float valueY, const float valueZ, const float valueW) { - ENSURE(m_InsidePass); + ENSURE(m_InsidePass || m_InsideComputePass); m_ShaderProgram->SetUniform(bindingSlot, valueX, valueY, valueZ, valueW); } void CDeviceCommandContext::SetUniform( const int32_t bindingSlot, PS::span values) { - ENSURE(m_InsidePass); + ENSURE(m_InsidePass || m_InsideComputePass); m_ShaderProgram->SetUniform(bindingSlot, values); } void CDeviceCommandContext::BeginScopedLabel(const char* name) { if 
(!m_DebugScopedLabels) return; VkDebugUtilsLabelEXT label{}; label.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT; label.pLabelName = name; vkCmdBeginDebugUtilsLabelEXT(m_CommandContext->GetCommandBuffer(), &label); } void CDeviceCommandContext::EndScopedLabel() { if (!m_DebugScopedLabels) return; vkCmdEndDebugUtilsLabelEXT(m_CommandContext->GetCommandBuffer()); } void CDeviceCommandContext::Flush() { ENSURE(!m_InsideFramebufferPass); // TODO: fix unsafe copying when overlaping flushes/frames. m_VertexUploadRing->ExecuteUploads(m_PrependCommandContext->GetCommandBuffer()); m_IndexUploadRing->ExecuteUploads(m_PrependCommandContext->GetCommandBuffer()); m_UniformUploadRing->ExecuteUploads(m_PrependCommandContext->GetCommandBuffer()); m_IsPipelineStateDirty = true; CTexture* backbufferReadbackTexture = m_QueuedReadbacks.empty() ? nullptr : m_Device->GetOrCreateBackbufferReadbackTexture(); const bool needsReadback = backbufferReadbackTexture; if (needsReadback) { CTexture* backbufferTexture = m_Device->GetCurrentBackbufferTexture(); { // We assume that the readback texture is in linear tiling. 
ScopedImageLayoutTransition scopedBackbufferTransition{ *m_CommandContext, {&backbufferTexture, 1}, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, VK_ACCESS_TRANSFER_READ_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT}; ScopedImageLayoutTransition scopedReadbackBackbufferTransition{ *m_CommandContext, {&backbufferReadbackTexture, 1}, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, VK_ACCESS_TRANSFER_WRITE_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT}; VkImageCopy region{}; region.srcSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; region.srcSubresource.layerCount = 1; region.dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; region.dstSubresource.layerCount = 1; region.extent.width = backbufferTexture->GetWidth(); region.extent.height = backbufferTexture->GetHeight(); region.extent.depth = 1; vkCmdCopyImage( m_CommandContext->GetCommandBuffer(), backbufferTexture->GetImage(), VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, backbufferReadbackTexture->GetImage(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, ®ion); } Utilities::SubmitMemoryBarrier( m_CommandContext->GetCommandBuffer(), VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_HOST_BIT); m_PrependCommandContext->Flush(); m_CommandContext->FlushAndWait(); VkImageSubresource subresource{}; subresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; VkSubresourceLayout subresourceLayout{}; vkGetImageSubresourceLayout( m_Device->GetVkDevice(), backbufferReadbackTexture->GetImage(), &subresource, &subresourceLayout); void* mappedData = backbufferReadbackTexture->GetMappedData(); ENSURE(mappedData); const uint32_t height = backbufferReadbackTexture->GetHeight(); const auto [redOffset, greenOffset, blueOffset] = backbufferReadbackTexture->GetFormat() == Format::B8G8R8A8_UNORM ? 
std::make_tuple(2, 1, 0) : std::make_tuple(0, 1, 2); for (const QueuedReadback& queuedReackback : m_QueuedReadbacks) { const std::byte* data = static_cast(mappedData); // Currently the backbuffer (0, 0) is the left-bottom corner (legacy from GL). data += subresourceLayout.offset + subresourceLayout.rowPitch * (height - queuedReackback.height - queuedReackback.y); for (uint32_t y = 0; y < queuedReackback.height; ++y) { const std::byte* row = data; for (uint32_t x = 0; x < queuedReackback.width; ++x) { const uint32_t sourceIndex = (queuedReackback.x + x) * 4; const uint32_t destinationIndex = ((queuedReackback.height - y - 1) * queuedReackback.width + x) * 3; std::byte* destinationPixelData = static_cast(queuedReackback.data) + destinationIndex; destinationPixelData[0] = row[sourceIndex + redOffset]; destinationPixelData[1] = row[sourceIndex + greenOffset]; destinationPixelData[2] = row[sourceIndex + blueOffset]; } data += subresourceLayout.rowPitch; } } } else { m_PrependCommandContext->Flush(); m_CommandContext->Flush(); } m_QueuedReadbacks.clear(); } void CDeviceCommandContext::PreDraw() { ENSURE(m_InsidePass); ApplyPipelineStateIfDirty(); m_ShaderProgram->PreDraw(*m_CommandContext); + UpdateOutdatedConstants(); +} + +void CDeviceCommandContext::UpdateOutdatedConstants() +{ if (m_ShaderProgram->IsMaterialConstantsDataOutdated()) { const VkDeviceSize alignment = std::max(static_cast(16), m_Device->GetChoosenPhysicalDevice().properties.limits.minUniformBufferOffsetAlignment); const uint32_t offset = m_UniformUploadRing->ScheduleUpload( m_PrependCommandContext->GetCommandBuffer(), PS::span{ m_ShaderProgram->GetMaterialConstantsData(), m_ShaderProgram->GetMaterialConstantsDataSize()}, alignment); m_ShaderProgram->UpdateMaterialConstantsData(); // TODO: maybe move inside shader program to reduce the # of bind calls. 
vkCmdBindDescriptorSets( m_CommandContext->GetCommandBuffer(), m_ShaderProgram->GetPipelineBindPoint(), m_ShaderProgram->GetPipelineLayout(), m_Device->GetDescriptorManager().GetUniformSet(), 1, &m_UniformDescriptorSet, 1, &offset); } } void CDeviceCommandContext::ApplyPipelineStateIfDirty() { if (!m_IsPipelineStateDirty) return; m_IsPipelineStateDirty = false; ENSURE(m_GraphicsPipelineState); ENSURE(m_VertexInputLayout); ENSURE(m_Framebuffer); VkPipeline pipeline = m_GraphicsPipelineState->GetOrCreatePipeline( m_VertexInputLayout, m_Framebuffer); ENSURE(pipeline != VK_NULL_HANDLE); if (m_LastBoundPipeline != pipeline) { m_LastBoundPipeline = pipeline; vkCmdBindPipeline(m_CommandContext->GetCommandBuffer(), m_ShaderProgram->GetPipelineBindPoint(), pipeline); m_ShaderProgram->Bind(); if (m_Device->GetDescriptorManager().UseDescriptorIndexing()) { vkCmdBindDescriptorSets( m_CommandContext->GetCommandBuffer(), m_ShaderProgram->GetPipelineBindPoint(), m_ShaderProgram->GetPipelineLayout(), 0, 1, &m_Device->GetDescriptorManager().GetDescriptorIndexingSet(), 0, nullptr); } } } void CDeviceCommandContext::BindVertexBuffer( const uint32_t bindingSlot, CBuffer* buffer, uint32_t offset) { VkBuffer vertexBuffers[] = { buffer->GetVkBuffer() }; VkDeviceSize offsets[] = { offset }; vkCmdBindVertexBuffers( m_CommandContext->GetCommandBuffer(), bindingSlot, std::size(vertexBuffers), vertexBuffers, offsets); } void CDeviceCommandContext::BindIndexBuffer(CBuffer* buffer, uint32_t offset) { if (buffer == m_BoundIndexBuffer && offset == m_BoundIndexBufferOffset) return; m_BoundIndexBuffer = buffer; m_BoundIndexBufferOffset = offset; vkCmdBindIndexBuffer( m_CommandContext->GetCommandBuffer(), buffer->GetVkBuffer(), offset, VK_INDEX_TYPE_UINT16); } } // namespace Vulkan } // namespace Backend } // namespace Renderer Index: ps/trunk/source/renderer/backend/vulkan/RingCommandContext.cpp =================================================================== --- 
ps/trunk/source/renderer/backend/vulkan/RingCommandContext.cpp (revision 28009) +++ ps/trunk/source/renderer/backend/vulkan/RingCommandContext.cpp (revision 28010) @@ -1,442 +1,442 @@ -/* Copyright (C) 2023 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . */ #include "precompiled.h" #include "RingCommandContext.h" #include "lib/bits.h" #include "renderer/backend/vulkan/Buffer.h" #include "renderer/backend/vulkan/Device.h" #include "renderer/backend/vulkan/Utilities.h" #include #include #include namespace Renderer { namespace Backend { namespace Vulkan { namespace { constexpr uint32_t INITIAL_STAGING_BUFFER_CAPACITY = 1024 * 1024; constexpr VkDeviceSize SMALL_HOST_TOTAL_MEMORY_THRESHOLD = 1024 * 1024 * 1024; constexpr uint32_t MAX_SMALL_STAGING_BUFFER_CAPACITY = 64 * 1024 * 1024; constexpr uint32_t MAX_STAGING_BUFFER_CAPACITY = 256 * 1024 * 1024; constexpr uint32_t INVALID_OFFSET = std::numeric_limits::max(); } // anonymous namespace CRingCommandContext::CRingCommandContext( CDevice* device, const size_t size, const uint32_t queueFamilyIndex, CSubmitScheduler& submitScheduler) : m_Device(device), m_SubmitScheduler(submitScheduler) { ENSURE(m_Device); m_OptimalBufferCopyOffsetAlignment = std::max( 1u, static_cast(m_Device->GetChoosenPhysicalDevice().properties.limits.optimalBufferCopyOffsetAlignment)); // In case of small amount of host memory it's better to 
make uploading // slower rather than crashing due to OOM, because memory for a // staging buffer is allocated in the host memory. m_MaxStagingBufferCapacity = m_Device->GetChoosenPhysicalDevice().hostTotalMemory <= SMALL_HOST_TOTAL_MEMORY_THRESHOLD ? MAX_SMALL_STAGING_BUFFER_CAPACITY : MAX_STAGING_BUFFER_CAPACITY; m_Ring.resize(size); for (RingItem& item : m_Ring) { VkCommandPoolCreateInfo commandPoolCreateInfoInfo{}; commandPoolCreateInfoInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; commandPoolCreateInfoInfo.queueFamilyIndex = queueFamilyIndex; ENSURE_VK_SUCCESS(vkCreateCommandPool( m_Device->GetVkDevice(), &commandPoolCreateInfoInfo, nullptr, &item.commandPool)); VkCommandBufferAllocateInfo allocateInfo{}; allocateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; allocateInfo.commandPool = item.commandPool; allocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; allocateInfo.commandBufferCount = 1; ENSURE_VK_SUCCESS(vkAllocateCommandBuffers( m_Device->GetVkDevice(), &allocateInfo, &item.commandBuffer)); device->SetObjectName( VK_OBJECT_TYPE_COMMAND_BUFFER, item.commandBuffer, "RingCommandBuffer"); } } CRingCommandContext::~CRingCommandContext() { VkDevice device = m_Device->GetVkDevice(); for (RingItem& item : m_Ring) { if (item.commandBuffer != VK_NULL_HANDLE) vkFreeCommandBuffers(device, item.commandPool, 1, &item.commandBuffer); if (item.commandPool != VK_NULL_HANDLE) vkDestroyCommandPool(device, item.commandPool, nullptr); } } VkCommandBuffer CRingCommandContext::GetCommandBuffer() { RingItem& item = m_Ring[m_RingIndex]; if (!item.isBegan) Begin(); return item.commandBuffer; } void CRingCommandContext::Flush() { RingItem& item = m_Ring[m_RingIndex]; if (!item.isBegan) return; End(); item.handle = m_SubmitScheduler.Submit(item.commandBuffer); m_RingIndex = (m_RingIndex + 1) % m_Ring.size(); } void CRingCommandContext::FlushAndWait() { RingItem& item = m_Ring[m_RingIndex]; ENSURE(item.isBegan); End(); item.handle = 
m_SubmitScheduler.Submit(item.commandBuffer); WaitUntilFree(item); } void CRingCommandContext::ScheduleUpload( CTexture* texture, const Format dataFormat, const void* data, const size_t dataSize, const uint32_t level, const uint32_t layer) { const uint32_t mininumSize = 1u; const uint32_t width = std::max(mininumSize, texture->GetWidth() >> level); const uint32_t height = std::max(mininumSize, texture->GetHeight() >> level); ScheduleUpload( texture, dataFormat, data, dataSize, 0, 0, width, height, level, layer); } void CRingCommandContext::ScheduleUpload( CTexture* texture, const Format UNUSED(dataFormat), const void* data, const size_t dataSize, const uint32_t xOffset, const uint32_t yOffset, const uint32_t width, const uint32_t height, const uint32_t level, const uint32_t layer) { ENSURE(texture->GetType() != ITexture::Type::TEXTURE_2D_MULTISAMPLE); const Format format = texture->GetFormat(); if (texture->GetType() != ITexture::Type::TEXTURE_CUBE) ENSURE(layer == 0); ENSURE(format != Format::R8G8B8_UNORM); const bool isCompressedFormat = format == Format::BC1_RGB_UNORM || format == Format::BC1_RGBA_UNORM || format == Format::BC2_UNORM || format == Format::BC3_UNORM; ENSURE( format == Format::R8_UNORM || format == Format::R8G8_UNORM || format == Format::R8G8B8A8_UNORM || format == Format::A8_UNORM || format == Format::L8_UNORM || isCompressedFormat); // TODO: use a more precise format alignment. 
constexpr uint32_t formatAlignment = 16; const uint32_t offset = AcquireFreeSpace(dataSize, std::max(formatAlignment, m_OptimalBufferCopyOffsetAlignment)); std::memcpy(static_cast(m_StagingBuffer->GetMappedData()) + offset, data, dataSize); VkCommandBuffer commandBuffer = GetCommandBuffer(); VkImage image = texture->GetImage(); Utilities::SubmitImageMemoryBarrier( commandBuffer, image, level, layer, VK_ACCESS_SHADER_READ_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, - VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT); + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT); VkBufferImageCopy region{}; region.bufferOffset = offset; region.bufferRowLength = 0; region.bufferImageHeight = 0; region.imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; region.imageSubresource.mipLevel = level; region.imageSubresource.baseArrayLayer = layer; region.imageSubresource.layerCount = 1; region.imageOffset = {static_cast(xOffset), static_cast(yOffset), 0}; region.imageExtent = {width, height, 1}; vkCmdCopyBufferToImage( commandBuffer, m_StagingBuffer->GetVkBuffer(), image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, ®ion); VkAccessFlags dstAccessFlags = VK_ACCESS_SHADER_READ_BIT; - VkPipelineStageFlags dstStageMask = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; + VkPipelineStageFlags dstStageMask = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; Utilities::SubmitImageMemoryBarrier( commandBuffer, image, level, layer, VK_ACCESS_TRANSFER_WRITE_BIT, dstAccessFlags, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_PIPELINE_STAGE_TRANSFER_BIT, dstStageMask); texture->SetInitialized(); } void CRingCommandContext::ScheduleUpload( CBuffer* buffer, const void* data, const uint32_t dataOffset, const uint32_t dataSize) { constexpr uint32_t alignment = 16; const uint32_t offset = 
AcquireFreeSpace(dataSize, alignment); std::memcpy(static_cast(m_StagingBuffer->GetMappedData()) + offset, data, dataSize); ScheduleUpload(buffer, dataOffset, dataSize, offset); } void CRingCommandContext::ScheduleUpload( CBuffer* buffer, const uint32_t dataOffset, const uint32_t dataSize, const UploadBufferFunction& uploadFunction) { constexpr uint32_t alignment = 16; const uint32_t offset = AcquireFreeSpace(dataSize, alignment); CBuffer* stagingBuffer = m_StagingBuffer->As(); uploadFunction(static_cast(stagingBuffer->GetMappedData()) + offset - dataOffset); ScheduleUpload(buffer, dataOffset, dataSize, offset); } void CRingCommandContext::ScheduleUpload( CBuffer* buffer, const uint32_t dataOffset, const uint32_t dataSize, const uint32_t acquiredOffset) { CBuffer* stagingBuffer = m_StagingBuffer->As(); VkCommandBuffer commandBuffer = GetCommandBuffer(); VkBufferCopy region{}; region.srcOffset = acquiredOffset; region.dstOffset = dataOffset; region.size = dataSize; // TODO: remove transfer mask from pipeline barrier, as we need to batch copies. VkPipelineStageFlags srcStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT; VkPipelineStageFlags dstStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT; if (buffer->GetType() == IBuffer::Type::VERTEX || buffer->GetType() == IBuffer::Type::INDEX) srcStageMask = VK_PIPELINE_STAGE_VERTEX_INPUT_BIT; else if (buffer->GetType() == IBuffer::Type::UNIFORM) - srcStageMask = VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; + srcStageMask = VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; Utilities::SubmitPipelineBarrier( commandBuffer, srcStageMask, dstStageMask); // TODO: currently we might overwrite data which triggers validation // assertion about Write-After-Write hazard. 
if (buffer->IsDynamic()) { Utilities::SubmitBufferMemoryBarrier( commandBuffer, buffer, dataOffset, dataSize, VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT); } vkCmdCopyBuffer( commandBuffer, stagingBuffer->GetVkBuffer(), buffer->GetVkBuffer(), 1, ®ion); VkAccessFlags srcAccessFlags = VK_ACCESS_TRANSFER_WRITE_BIT; VkAccessFlags dstAccessFlags = 0; srcStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT; dstStageMask = 0; if (buffer->GetType() == IBuffer::Type::VERTEX) { dstAccessFlags = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT; dstStageMask = VK_PIPELINE_STAGE_VERTEX_INPUT_BIT; } else if (buffer->GetType() == IBuffer::Type::INDEX) { dstAccessFlags = VK_ACCESS_INDEX_READ_BIT; dstStageMask = VK_PIPELINE_STAGE_VERTEX_INPUT_BIT; } else if (buffer->GetType() == IBuffer::Type::UNIFORM) { dstAccessFlags = VK_ACCESS_UNIFORM_READ_BIT; - dstStageMask = VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; + dstStageMask = VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; } Utilities::SubmitBufferMemoryBarrier( commandBuffer, buffer, dataOffset, dataSize, srcAccessFlags, dstAccessFlags, srcStageMask, dstStageMask); } void CRingCommandContext::Begin() { RingItem& item = m_Ring[m_RingIndex]; item.isBegan = true; WaitUntilFree(item); m_StagingBufferCurrentFirst = m_StagingBufferLast; ENSURE_VK_SUCCESS(vkResetCommandPool(m_Device->GetVkDevice(), item.commandPool, 0)); VkCommandBufferBeginInfo beginInfo{}; beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; beginInfo.pInheritanceInfo = nullptr; ENSURE_VK_SUCCESS(vkBeginCommandBuffer(item.commandBuffer, &beginInfo)); } void CRingCommandContext::End() { RingItem& item = m_Ring[m_RingIndex]; item.isBegan = false; item.stagingBufferFirst = m_StagingBufferCurrentFirst; item.stagingBufferLast = m_StagingBufferLast; 
ENSURE_VK_SUCCESS(vkEndCommandBuffer(item.commandBuffer)); } void CRingCommandContext::WaitUntilFree(RingItem& item) { m_SubmitScheduler.WaitUntilFree(item.handle); if (item.stagingBufferFirst != item.stagingBufferLast) { m_StagingBufferFirst = item.stagingBufferLast; item.stagingBufferFirst = 0; item.stagingBufferLast = 0; } } uint32_t CRingCommandContext::AcquireFreeSpace( const uint32_t requiredSize, const uint32_t requiredAlignment) { ENSURE(requiredSize <= m_MaxStagingBufferCapacity); const uint32_t offsetCandidate = GetFreeSpaceOffset(requiredSize, requiredAlignment); const bool needsResize = !m_StagingBuffer || offsetCandidate == INVALID_OFFSET; const bool canResize = !m_StagingBuffer || m_StagingBuffer->GetSize() < m_MaxStagingBufferCapacity; if (needsResize && canResize) { const uint32_t minimumRequiredCapacity = round_up_to_pow2(requiredSize); const uint32_t newCapacity = std::min( std::max(m_StagingBuffer ? m_StagingBuffer->GetSize() * 2 : INITIAL_STAGING_BUFFER_CAPACITY, minimumRequiredCapacity), m_MaxStagingBufferCapacity); m_StagingBuffer = m_Device->CreateCBuffer( "UploadRingBuffer", IBuffer::Type::UPLOAD, newCapacity, false); ENSURE(m_StagingBuffer); m_StagingBufferFirst = 0; m_StagingBufferCurrentFirst = 0; m_StagingBufferLast = requiredSize; for (RingItem& item : m_Ring) { item.stagingBufferFirst = 0; item.stagingBufferLast = 0; } return 0; } else if (needsResize) { // In case we can't resize we need to wait until all scheduled uploads are // completed. for (size_t ringIndexOffset = 1; ringIndexOffset < m_Ring.size() && GetFreeSpaceOffset(requiredSize, requiredAlignment) == INVALID_OFFSET; ++ringIndexOffset) { const size_t ringIndex = (m_RingIndex + ringIndexOffset) % m_Ring.size(); RingItem& item = m_Ring[ringIndex]; WaitUntilFree(item); } // If we still don't have a free space it means we need to flush the // current command buffer. 
const uint32_t offset = GetFreeSpaceOffset(requiredSize, requiredAlignment); if (offset == INVALID_OFFSET) { RingItem& item = m_Ring[m_RingIndex]; if (item.isBegan) Flush(); WaitUntilFree(item); m_StagingBufferFirst = 0; m_StagingBufferCurrentFirst = 0; m_StagingBufferLast = requiredSize; return 0; } else { m_StagingBufferLast = offset + requiredSize; return offset; } } else { m_StagingBufferLast = offsetCandidate + requiredSize; return offsetCandidate; } } uint32_t CRingCommandContext::GetFreeSpaceOffset( const uint32_t requiredSize, const uint32_t requiredAlignment) const { if (!m_StagingBuffer) return INVALID_OFFSET; const uint32_t candidateOffset = round_up(m_StagingBufferLast, requiredAlignment); const uint32_t candidateLast = candidateOffset + requiredSize; if (m_StagingBufferFirst <= m_StagingBufferLast) { if (candidateLast <= m_StagingBuffer->GetSize()) return candidateOffset; // We intentionally use exclusive comparison to avoid distinguishing // completely full and completely empty staging buffers. else if (requiredSize < m_StagingBufferFirst) return 0; // We assume the first byte is always perfectly aligned. else return INVALID_OFFSET; } else { if (candidateLast < m_StagingBufferFirst) return candidateOffset; else return INVALID_OFFSET; } } } // namespace Vulkan } // namespace Backend } // namespace Renderer Index: ps/trunk/source/renderer/backend/vulkan/SwapChain.cpp =================================================================== --- ps/trunk/source/renderer/backend/vulkan/SwapChain.cpp (revision 28009) +++ ps/trunk/source/renderer/backend/vulkan/SwapChain.cpp (revision 28010) @@ -1,396 +1,398 @@ -/* Copyright (C) 2023 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. 
* * 0 A.D. is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . */ #include "precompiled.h" #include "SwapChain.h" #include "lib/hash.h" #include "maths/MathUtil.h" #include "ps/ConfigDB.h" #include "ps/Profile.h" #include "renderer/backend/vulkan/Device.h" #include "renderer/backend/vulkan/Framebuffer.h" #include "renderer/backend/vulkan/RingCommandContext.h" #include "renderer/backend/vulkan/Texture.h" #include "renderer/backend/vulkan/Utilities.h" #include #include namespace Renderer { namespace Backend { namespace Vulkan { // static std::unique_ptr CSwapChain::Create( CDevice* device, VkSurfaceKHR surface, int surfaceDrawableWidth, int surfaceDrawableHeight, std::unique_ptr oldSwapChain) { VkPhysicalDevice physicalDevice = device->GetChoosenPhysicalDevice().device; VkSurfaceCapabilitiesKHR surfaceCapabilities{}; ENSURE_VK_SUCCESS(vkGetPhysicalDeviceSurfaceCapabilitiesKHR( physicalDevice, surface, &surfaceCapabilities)); const uint32_t swapChainWidth = Clamp(surfaceDrawableWidth, surfaceCapabilities.minImageExtent.width, surfaceCapabilities.maxImageExtent.width); const uint32_t swapChainHeight = Clamp(surfaceDrawableHeight, surfaceCapabilities.minImageExtent.height, surfaceCapabilities.maxImageExtent.height); // Some drivers (for example NVIDIA on Windows during minimize) might // return zeroes for both minImageExtent and maxImageExtent. It means we're // not able to create any swapchain. Because we can't choose zeros (they're // not allowed) and we can't choose values bigger than maxImageExtent // (which are also zeroes in that case). 
if (swapChainWidth == 0 || swapChainHeight == 0) return nullptr; std::vector surfaceFormats; uint32_t surfaceFormatCount = 0; ENSURE_VK_SUCCESS(vkGetPhysicalDeviceSurfaceFormatsKHR( physicalDevice, surface, &surfaceFormatCount, nullptr)); if (surfaceFormatCount > 0) { surfaceFormats.resize(surfaceFormatCount); ENSURE_VK_SUCCESS(vkGetPhysicalDeviceSurfaceFormatsKHR( physicalDevice, surface, &surfaceFormatCount, surfaceFormats.data())); } std::vector presentModes; uint32_t presentModeCount = 0; ENSURE_VK_SUCCESS(vkGetPhysicalDeviceSurfacePresentModesKHR( physicalDevice, surface, &presentModeCount, nullptr)); if (presentModeCount > 0) { presentModes.resize(presentModeCount); ENSURE_VK_SUCCESS(vkGetPhysicalDeviceSurfacePresentModesKHR( physicalDevice, surface, &presentModeCount, presentModes.data())); } // VK_PRESENT_MODE_FIFO_KHR is guaranteed to be supported. VkPresentModeKHR presentMode = VK_PRESENT_MODE_FIFO_KHR; auto isPresentModeAvailable = [&presentModes](const VkPresentModeKHR presentMode) { return std::find(presentModes.begin(), presentModes.end(), presentMode) != presentModes.end(); }; bool vsyncEnabled = true; CFG_GET_VAL("vsync", vsyncEnabled); if (vsyncEnabled) { // TODO: use the adaptive one when possible. // https://gitlab.freedesktop.org/mesa/mesa/-/issues/5516 //if (isPresentModeAvailable(VK_PRESENT_MODE_MAILBOX_KHR)) // presentMode = VK_PRESENT_MODE_MAILBOX_KHR; } else { if (isPresentModeAvailable(VK_PRESENT_MODE_IMMEDIATE_KHR)) presentMode = VK_PRESENT_MODE_IMMEDIATE_KHR; } // Spec says: // The number of format pairs supported must be greater than or equal to 1. // pSurfaceFormats must not contain an entry whose value for format is // VK_FORMAT_UNDEFINED. 
const auto surfaceFormatIt = std::find_if(surfaceFormats.begin(), surfaceFormats.end(), IsSurfaceFormatSupported); if (surfaceFormatIt == surfaceFormats.end()) { LOGERROR("Can't find a suitable surface format to render to."); return nullptr; } const VkSurfaceFormatKHR& surfaceFormat = *surfaceFormatIt; VkSwapchainCreateInfoKHR swapChainCreateInfo{}; swapChainCreateInfo.sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR; swapChainCreateInfo.surface = surface; // minImageCount + 1 is to have a less chance for a presenter to wait. // maxImageCount might be zero, it means it's unlimited. const uint32_t maxImageCount = surfaceCapabilities.maxImageCount > 0 ? surfaceCapabilities.maxImageCount : std::numeric_limits::max(); const uint32_t minImageCount = surfaceCapabilities.minImageCount < maxImageCount ? surfaceCapabilities.minImageCount + 1 : surfaceCapabilities.minImageCount; swapChainCreateInfo.minImageCount = Clamp(NUMBER_OF_FRAMES_IN_FLIGHT, minImageCount, maxImageCount); swapChainCreateInfo.imageFormat = surfaceFormat.format; swapChainCreateInfo.imageColorSpace = surfaceFormat.colorSpace; swapChainCreateInfo.imageExtent.width = swapChainWidth; swapChainCreateInfo.imageExtent.height = swapChainHeight; swapChainCreateInfo.imageArrayLayers = 1; // VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT is guaranteed to present. // VK_IMAGE_USAGE_TRANSFER_SRC_BIT allows a simpler backbuffer readback. // VK_IMAGE_USAGE_TRANSFER_DST_BIT allows a blit to the backbuffer. + // VK_IMAGE_USAGE_STORAGE_BIT allows to write to the backbuffer directly + // from a compute shader. 
swapChainCreateInfo.imageUsage = - (VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT) & - surfaceCapabilities.supportedUsageFlags; + (VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_STORAGE_BIT) & + surfaceCapabilities.supportedUsageFlags; swapChainCreateInfo.imageSharingMode = VK_SHARING_MODE_EXCLUSIVE; // We need to set these only if imageSharingMode is VK_SHARING_MODE_CONCURRENT. swapChainCreateInfo.queueFamilyIndexCount = 0; swapChainCreateInfo.pQueueFamilyIndices = nullptr; // By default VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR is preferable. if (surfaceCapabilities.supportedTransforms & VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR) swapChainCreateInfo.preTransform = VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR; else swapChainCreateInfo.preTransform = surfaceCapabilities.currentTransform; // By default VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR is preferable, other bits // might require some format or rendering adjustemnts to avoid // semi-transparent areas. 
const VkCompositeAlphaFlagBitsKHR compositeAlphaOrder[] = { VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR, VK_COMPOSITE_ALPHA_INHERIT_BIT_KHR, VK_COMPOSITE_ALPHA_PRE_MULTIPLIED_BIT_KHR, VK_COMPOSITE_ALPHA_POST_MULTIPLIED_BIT_KHR }; for (const VkCompositeAlphaFlagBitsKHR compositeAlpha : compositeAlphaOrder) if (compositeAlpha & surfaceCapabilities.supportedCompositeAlpha) { swapChainCreateInfo.compositeAlpha = compositeAlpha; break; } swapChainCreateInfo.presentMode = presentMode; swapChainCreateInfo.clipped = VK_TRUE; if (oldSwapChain) swapChainCreateInfo.oldSwapchain = oldSwapChain->GetVkSwapchain(); std::unique_ptr swapChain(new CSwapChain()); swapChain->m_Device = device; ENSURE_VK_SUCCESS(vkCreateSwapchainKHR( device->GetVkDevice(), &swapChainCreateInfo, nullptr, &swapChain->m_SwapChain)); char nameBuffer[64]; snprintf(nameBuffer, std::size(nameBuffer), "SwapChain: %dx%d", surfaceDrawableWidth, surfaceDrawableHeight); device->SetObjectName(VK_OBJECT_TYPE_SWAPCHAIN_KHR, swapChain->m_SwapChain, nameBuffer); uint32_t imageCount = 0; VkResult getSwapchainImagesResult = VK_INCOMPLETE; do { getSwapchainImagesResult = vkGetSwapchainImagesKHR( device->GetVkDevice(), swapChain->m_SwapChain, &imageCount, nullptr); if (getSwapchainImagesResult == VK_SUCCESS && imageCount > 0) { swapChain->m_Images.resize(imageCount); getSwapchainImagesResult = vkGetSwapchainImagesKHR( device->GetVkDevice(), swapChain->m_SwapChain, &imageCount, swapChain->m_Images.data()); } } while (getSwapchainImagesResult == VK_INCOMPLETE); LOGMESSAGE("SwapChain image count: %u (min: %u)", imageCount, swapChainCreateInfo.minImageCount); ENSURE_VK_SUCCESS(getSwapchainImagesResult); ENSURE(imageCount > 0); swapChain->m_DepthTexture = CTexture::Create( device, "SwapChainDepthTexture", ITexture::Type::TEXTURE_2D, ITexture::Usage::DEPTH_STENCIL_ATTACHMENT, device->GetPreferredDepthStencilFormat( Renderer::Backend::ITexture::Usage::DEPTH_STENCIL_ATTACHMENT, true, true), swapChainWidth, swapChainHeight, 
Sampler::MakeDefaultSampler( Sampler::Filter::NEAREST, Sampler::AddressMode::CLAMP_TO_EDGE), 1, 1); swapChain->m_ImageFormat = swapChainCreateInfo.imageFormat; swapChain->m_Textures.resize(imageCount); swapChain->m_Backbuffers.resize(imageCount); for (size_t index = 0; index < imageCount; ++index) { snprintf(nameBuffer, std::size(nameBuffer), "SwapChainImage #%zu", index); device->SetObjectName(VK_OBJECT_TYPE_IMAGE, swapChain->m_Images[index], nameBuffer); snprintf(nameBuffer, std::size(nameBuffer), "SwapChainImageView #%zu", index); swapChain->m_Textures[index] = CTexture::WrapBackbufferImage( device, nameBuffer, swapChain->m_Images[index], swapChainCreateInfo.imageFormat, swapChainCreateInfo.imageUsage, swapChainWidth, swapChainHeight); } swapChain->m_IsValid = true; return swapChain; } CSwapChain::CSwapChain() = default; CSwapChain::~CSwapChain() { m_Backbuffers.clear(); m_Textures.clear(); m_DepthTexture.reset(); if (m_SwapChain != VK_NULL_HANDLE) vkDestroySwapchainKHR(m_Device->GetVkDevice(), m_SwapChain, nullptr); } size_t CSwapChain::SwapChainBackbuffer::BackbufferKeyHash::operator()(const BackbufferKey& key) const { size_t seed = 0; hash_combine(seed, std::get<0>(key)); hash_combine(seed, std::get<1>(key)); hash_combine(seed, std::get<2>(key)); hash_combine(seed, std::get<3>(key)); return seed; } CSwapChain::SwapChainBackbuffer::SwapChainBackbuffer() = default; CSwapChain::SwapChainBackbuffer::SwapChainBackbuffer(SwapChainBackbuffer&& other) = default; CSwapChain::SwapChainBackbuffer& CSwapChain::SwapChainBackbuffer::operator=(SwapChainBackbuffer&& other) = default; bool CSwapChain::AcquireNextImage(VkSemaphore acquireImageSemaphore) { ENSURE(m_CurrentImageIndex == std::numeric_limits::max()); const VkResult acquireResult = vkAcquireNextImageKHR( m_Device->GetVkDevice(), m_SwapChain, std::numeric_limits::max(), acquireImageSemaphore, VK_NULL_HANDLE, &m_CurrentImageIndex); if (acquireResult != VK_SUCCESS) { if (acquireResult == VK_ERROR_OUT_OF_DATE_KHR) 
m_IsValid = false; else if (acquireResult != VK_SUBOPTIMAL_KHR) { LOGERROR("Acquire result: %d (%s)", static_cast(acquireResult), Utilities::GetVkResultName(acquireResult)); debug_warn("Unknown acquire error."); } } return m_IsValid; } void CSwapChain::SubmitCommandsAfterAcquireNextImage( CRingCommandContext& commandContext) { const bool firstAcquirement = !m_Textures[m_CurrentImageIndex]->IsInitialized(); Utilities::SubmitImageMemoryBarrier( commandContext.GetCommandBuffer(), m_Images[m_CurrentImageIndex], 0, 0, 0, VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, firstAcquirement ? VK_IMAGE_LAYOUT_UNDEFINED : VK_IMAGE_LAYOUT_PRESENT_SRC_KHR, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT); if (!m_DepthTexture->IsInitialized()) { Utilities::SubmitImageMemoryBarrier( commandContext.GetCommandBuffer(), m_DepthTexture->GetImage(), 0, 0, 0, VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT, VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT); } } void CSwapChain::SubmitCommandsBeforePresent( CRingCommandContext& commandContext) { ENSURE(m_CurrentImageIndex != std::numeric_limits::max()); Utilities::SubmitImageMemoryBarrier( commandContext.GetCommandBuffer(), m_Images[m_CurrentImageIndex], 0, 0, VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, 0, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, VK_IMAGE_LAYOUT_PRESENT_SRC_KHR, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT); } void CSwapChain::Present(VkSemaphore submitDone, VkQueue queue) { ENSURE(m_CurrentImageIndex != std::numeric_limits::max()); VkSwapchainKHR swapChains[] = {m_SwapChain}; VkPresentInfoKHR presentInfo{}; presentInfo.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR; presentInfo.swapchainCount = 1; 
presentInfo.pSwapchains = swapChains; presentInfo.pImageIndices = &m_CurrentImageIndex; presentInfo.waitSemaphoreCount = 1; presentInfo.pWaitSemaphores = &submitDone; const VkResult presentResult = vkQueuePresentKHR(queue, &presentInfo); if (presentResult != VK_SUCCESS) { if (presentResult == VK_ERROR_OUT_OF_DATE_KHR) m_IsValid = false; else if (presentResult != VK_SUBOPTIMAL_KHR) { LOGERROR("Present result: %d (%s)", static_cast(presentResult), Utilities::GetVkResultName(presentResult)); debug_warn("Unknown present error."); } } m_CurrentImageIndex = std::numeric_limits::max(); } CFramebuffer* CSwapChain::GetCurrentBackbuffer( const AttachmentLoadOp colorAttachmentLoadOp, const AttachmentStoreOp colorAttachmentStoreOp, const AttachmentLoadOp depthStencilAttachmentLoadOp, const AttachmentStoreOp depthStencilAttachmentStoreOp) { ENSURE(m_CurrentImageIndex != std::numeric_limits::max()); SwapChainBackbuffer& swapChainBackbuffer = m_Backbuffers[m_CurrentImageIndex]; const SwapChainBackbuffer::BackbufferKey key{ colorAttachmentLoadOp, colorAttachmentStoreOp, depthStencilAttachmentLoadOp, depthStencilAttachmentStoreOp}; auto it = swapChainBackbuffer.backbuffers.find(key); if (it == swapChainBackbuffer.backbuffers.end()) { char nameBuffer[64]; snprintf(nameBuffer, std::size(nameBuffer), "Backbuffer #%u", m_CurrentImageIndex); SColorAttachment colorAttachment{}; colorAttachment.texture = m_Textures[m_CurrentImageIndex].get(); colorAttachment.loadOp = colorAttachmentLoadOp; colorAttachment.storeOp = colorAttachmentStoreOp; SDepthStencilAttachment depthStencilAttachment{}; depthStencilAttachment.texture = m_DepthTexture.get(); depthStencilAttachment.loadOp = depthStencilAttachmentLoadOp; depthStencilAttachment.storeOp = depthStencilAttachmentStoreOp; it = swapChainBackbuffer.backbuffers.emplace(key, CFramebuffer::Create( m_Device, nameBuffer, &colorAttachment, &depthStencilAttachment)).first; } return it->second.get(); } CTexture* CSwapChain::GetCurrentBackbufferTexture() { 
ENSURE(m_CurrentImageIndex != std::numeric_limits::max()); return m_Textures[m_CurrentImageIndex].get(); } } // namespace Vulkan } // namespace Backend } // namespace Renderer Index: ps/trunk/source/renderer/backend/vulkan/ShaderProgram.h =================================================================== --- ps/trunk/source/renderer/backend/vulkan/ShaderProgram.h (revision 28009) +++ ps/trunk/source/renderer/backend/vulkan/ShaderProgram.h (revision 28010) @@ -1,183 +1,191 @@ -/* Copyright (C) 2023 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . 
*/ #ifndef INCLUDED_RENDERER_BACKEND_VULKAN_SHADERPROGRAM #define INCLUDED_RENDERER_BACKEND_VULKAN_SHADERPROGRAM #include "renderer/backend/IShaderProgram.h" +#include "renderer/backend/vulkan/Buffer.h" #include "renderer/backend/vulkan/DescriptorManager.h" #include "renderer/backend/vulkan/Texture.h" #include #include #include #include #include #include #include class CShaderDefines; class CStr; namespace Renderer { namespace Backend { namespace Vulkan { class CDevice; class CRingCommandContext; class CVertexInputLayout : public IVertexInputLayout { public: CVertexInputLayout(CDevice* device, const PS::span attributes) : m_Device(device), m_Attributes(attributes.begin(), attributes.end()) { static uint32_t m_LastAvailableUID = 1; m_UID = m_LastAvailableUID++; for (const SVertexAttributeFormat& attribute : m_Attributes) { ENSURE(attribute.format != Format::UNDEFINED); ENSURE(attribute.stride > 0); } } ~CVertexInputLayout() override = default; IDevice* GetDevice() override; const std::vector& GetAttributes() const noexcept { return m_Attributes; } using UID = uint32_t; UID GetUID() const { return m_UID; } private: CDevice* m_Device = nullptr; UID m_UID = 0; std::vector m_Attributes; }; class CShaderProgram final : public IShaderProgram { public: ~CShaderProgram() override; IDevice* GetDevice() override; int32_t GetBindingSlot(const CStrIntern name) const override; std::vector GetFileDependencies() const override; uint32_t GetStreamLocation(const VertexAttributeStream stream) const; const std::vector& GetStages() const { return m_Stages; } void Bind(); void Unbind(); + void PreDraw(CRingCommandContext& commandContext); + void PreDispatch(CRingCommandContext& commandContext); + void PostDispatch(CRingCommandContext& commandContext); VkPipelineLayout GetPipelineLayout() const { return m_PipelineLayout; } - VkPipelineBindPoint GetPipelineBindPoint() const { return VK_PIPELINE_BIND_POINT_GRAPHICS; } + VkPipelineBindPoint GetPipelineBindPoint() const { return 
m_PipelineBindPoint; } void SetUniform( const int32_t bindingSlot, const float value); void SetUniform( const int32_t bindingSlot, const float valueX, const float valueY); void SetUniform( const int32_t bindingSlot, const float valueX, const float valueY, const float valueZ); void SetUniform( const int32_t bindingSlot, const float valueX, const float valueY, const float valueZ, const float valueW); void SetUniform( const int32_t bindingSlot, PS::span values); + void SetTexture(const int32_t bindingSlot, CTexture* texture); + void SetStorageTexture(const int32_t bindingSlot, CTexture* texture); // TODO: rename to something related to buffer. bool IsMaterialConstantsDataOutdated() const { return m_MaterialConstantsDataOutdated; } void UpdateMaterialConstantsData() { m_MaterialConstantsDataOutdated = false; } std::byte* GetMaterialConstantsData() const { return m_MaterialConstantsData.get(); } uint32_t GetMaterialConstantsDataSize() const { return m_MaterialConstantsDataSize; } private: friend class CDevice; CShaderProgram(); std::pair GetUniformData( const int32_t bindingSlot, const uint32_t dataSize); static std::unique_ptr Create( CDevice* device, const CStr& name, const CShaderDefines& defines); void BindOutdatedDescriptorSets( CRingCommandContext& commandContext); CDevice* m_Device = nullptr; std::vector m_ShaderModules; std::vector m_Stages; VkPipelineLayout m_PipelineLayout = VK_NULL_HANDLE; + VkPipelineBindPoint m_PipelineBindPoint = VK_PIPELINE_BIND_POINT_MAX_ENUM; std::vector m_FileDependencies; struct PushConstant { CStrIntern name; uint32_t offset; uint32_t size; VkShaderStageFlags stageFlags; }; struct Uniform { CStrIntern name; uint32_t offset; uint32_t size; }; std::unique_ptr m_MaterialConstantsData; uint32_t m_MaterialConstantsDataSize = 0; bool m_MaterialConstantsDataOutdated = false; std::array m_PushConstantData; uint32_t m_PushConstantDataMask = 0; std::array m_PushConstantDataFlags; std::vector m_PushConstants; std::vector m_Uniforms; 
std::unordered_map m_UniformMapping; std::unordered_map m_PushConstantMapping; std::optional> m_TextureBinding; + std::optional> m_StorageImageBinding; std::unordered_map m_StreamLocations; }; } // namespace Vulkan } // namespace Backend } // namespace Renderer #endif // INCLUDED_RENDERER_BACKEND_VULKAN_SHADERPROGRAM Index: ps/trunk/source/renderer/backend/vulkan/Texture.cpp =================================================================== --- ps/trunk/source/renderer/backend/vulkan/Texture.cpp (revision 28009) +++ ps/trunk/source/renderer/backend/vulkan/Texture.cpp (revision 28010) @@ -1,369 +1,388 @@ -/* Copyright (C) 2023 Wildfire Games. +/* Copyright (C) 2024 Wildfire Games. * This file is part of 0 A.D. * * 0 A.D. is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * 0 A.D. is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with 0 A.D. If not, see . 
*/ #include "precompiled.h" #include "Texture.h" #include "renderer/backend/vulkan/Device.h" #include "renderer/backend/vulkan/Mapping.h" #include "renderer/backend/vulkan/SamplerManager.h" #include "renderer/backend/vulkan/Utilities.h" namespace Renderer { namespace Backend { namespace Vulkan { // static std::unique_ptr CTexture::Create( CDevice* device, const char* name, const Type type, const uint32_t usage, const Format format, const uint32_t width, const uint32_t height, const Sampler::Desc& defaultSamplerDesc, const uint32_t MIPLevelCount, const uint32_t sampleCount) { std::unique_ptr texture(new CTexture(device)); texture->m_Format = format; texture->m_Type = type; texture->m_Usage = usage; texture->m_Width = width; texture->m_Height = height; texture->m_MIPLevelCount = MIPLevelCount; texture->m_SampleCount = sampleCount; texture->m_LayerCount = type == ITexture::Type::TEXTURE_CUBE ? 6 : 1; if (type == Type::TEXTURE_2D_MULTISAMPLE) ENSURE(sampleCount > 1); VkFormat imageFormat = VK_FORMAT_UNDEFINED; // A8 and L8 are special cases for GL2.1, because it doesn't have a proper // channel swizzling. if (format == Format::A8_UNORM || format == Format::L8_UNORM) imageFormat = VK_FORMAT_R8_UNORM; else imageFormat = Mapping::FromFormat(format); texture->m_VkFormat = imageFormat; VkImageType imageType = VK_IMAGE_TYPE_2D; VkImageTiling tiling = VK_IMAGE_TILING_OPTIMAL; const VkPhysicalDevice physicalDevice = device->GetChoosenPhysicalDevice().device; VkFormatProperties formatProperties{}; vkGetPhysicalDeviceFormatProperties( physicalDevice, imageFormat, &formatProperties); + if (!(usage & Usage::SAMPLED)) + { + // A texture can't be *_ATTACHMENT and STORAGE at the same time without + // to be SAMPLED. 
+ const bool isAttachment = (usage & Usage::COLOR_ATTACHMENT) || (usage & Usage::DEPTH_STENCIL_ATTACHMENT); + const bool isStorage = usage & Usage::STORAGE; + ENSURE(!(isAttachment && isStorage)); + } + VkImageUsageFlags usageFlags = 0; // Vulkan 1.0 implies that TRANSFER_SRC and TRANSFER_DST are supported. // TODO: account Vulkan 1.1. if (usage & Usage::TRANSFER_SRC) usageFlags |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT; if (usage & Usage::TRANSFER_DST) usageFlags |= VK_IMAGE_USAGE_TRANSFER_DST_BIT; if (usage & Usage::SAMPLED) { ENSURE(type != Type::TEXTURE_2D_MULTISAMPLE); if (!(formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT)) { LOGERROR("Format %d doesn't support sampling for optimal tiling.", static_cast(imageFormat)); return nullptr; } usageFlags |= VK_IMAGE_USAGE_SAMPLED_BIT; } + if (usage & Usage::STORAGE) + { + ENSURE(type != Type::TEXTURE_2D_MULTISAMPLE); + if (!(formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT)) + { + LOGERROR("Format %d doesn't support storage for optimal tiling.", static_cast(imageFormat)); + return nullptr; + } + usageFlags |= VK_IMAGE_USAGE_STORAGE_BIT; + } if (usage & Usage::COLOR_ATTACHMENT) { ENSURE(device->IsFramebufferFormatSupported(format)); if (!(formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT)) { LOGERROR("Format %d doesn't support color attachment for optimal tiling.", static_cast(imageFormat)); return nullptr; } usageFlags |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; } if (usage & Usage::DEPTH_STENCIL_ATTACHMENT) { ENSURE(IsDepthFormat(format)); if (!(formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT)) { LOGERROR("Format %d doesn't support depth stencil attachment for optimal tiling.", static_cast(imageFormat)); return nullptr; } usageFlags |= VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT; } if (IsDepthFormat(format)) { texture->m_AttachmentImageAspectMask = VK_IMAGE_ASPECT_DEPTH_BIT; 
texture->m_SamplerImageAspectMask = VK_IMAGE_ASPECT_DEPTH_BIT; if (format == Format::D24_UNORM_S8_UINT || format == Format::D32_SFLOAT_S8_UINT) texture->m_AttachmentImageAspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT; } else { texture->m_AttachmentImageAspectMask = VK_IMAGE_ASPECT_COLOR_BIT; texture->m_SamplerImageAspectMask = VK_IMAGE_ASPECT_COLOR_BIT; } VkImageCreateInfo imageCreateInfo{}; imageCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO; imageCreateInfo.imageType = imageType; imageCreateInfo.extent.width = width; imageCreateInfo.extent.height = height; imageCreateInfo.extent.depth = 1; imageCreateInfo.mipLevels = MIPLevelCount; imageCreateInfo.arrayLayers = type == Type::TEXTURE_CUBE ? 6 : 1; imageCreateInfo.format = imageFormat; imageCreateInfo.samples = Mapping::FromSampleCount(sampleCount); imageCreateInfo.tiling = tiling; imageCreateInfo.usage = usageFlags; imageCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; imageCreateInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; if (type == Type::TEXTURE_CUBE) imageCreateInfo.flags = VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT; VmaAllocationCreateInfo allocationCreateInfo{}; if ((usage & Usage::COLOR_ATTACHMENT) || (usage & Usage::DEPTH_STENCIL_ATTACHMENT)) allocationCreateInfo.flags |= VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT; #ifndef NDEBUG allocationCreateInfo.flags |= VMA_ALLOCATION_CREATE_USER_DATA_COPY_STRING_BIT; allocationCreateInfo.pUserData = const_cast(name); #endif allocationCreateInfo.requiredFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; allocationCreateInfo.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE; const VkResult createImageResult = vmaCreateImage( device->GetVMAAllocator(), &imageCreateInfo, &allocationCreateInfo, &texture->m_Image, &texture->m_Allocation, nullptr); if (createImageResult != VK_SUCCESS) { LOGERROR("Failed to create VkImage: %d (%s)", static_cast(createImageResult), Utilities::GetVkResultName(createImageResult)); return nullptr; } VkImageViewCreateInfo imageViewCreateInfo{}; 
imageViewCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; imageViewCreateInfo.image = texture->m_Image; imageViewCreateInfo.viewType = type == Type::TEXTURE_CUBE ? VK_IMAGE_VIEW_TYPE_CUBE : VK_IMAGE_VIEW_TYPE_2D; imageViewCreateInfo.format = imageFormat; imageViewCreateInfo.subresourceRange.baseMipLevel = 0; imageViewCreateInfo.subresourceRange.levelCount = MIPLevelCount; imageViewCreateInfo.subresourceRange.baseArrayLayer = 0; imageViewCreateInfo.subresourceRange.layerCount = type == Type::TEXTURE_CUBE ? 6 : 1; if (format == Format::A8_UNORM) { imageViewCreateInfo.components.r = VK_COMPONENT_SWIZZLE_ZERO; imageViewCreateInfo.components.g = VK_COMPONENT_SWIZZLE_ZERO; imageViewCreateInfo.components.b = VK_COMPONENT_SWIZZLE_ZERO; imageViewCreateInfo.components.a = VK_COMPONENT_SWIZZLE_R; } else if (format == Format::L8_UNORM) { imageViewCreateInfo.components.r = VK_COMPONENT_SWIZZLE_R; imageViewCreateInfo.components.g = VK_COMPONENT_SWIZZLE_R; imageViewCreateInfo.components.b = VK_COMPONENT_SWIZZLE_R; imageViewCreateInfo.components.a = VK_COMPONENT_SWIZZLE_ONE; } else { imageViewCreateInfo.components.r = VK_COMPONENT_SWIZZLE_IDENTITY; imageViewCreateInfo.components.g = VK_COMPONENT_SWIZZLE_IDENTITY; imageViewCreateInfo.components.b = VK_COMPONENT_SWIZZLE_IDENTITY; imageViewCreateInfo.components.a = VK_COMPONENT_SWIZZLE_IDENTITY; } if ((usage & Usage::COLOR_ATTACHMENT) || (usage & Usage::DEPTH_STENCIL_ATTACHMENT)) { imageViewCreateInfo.subresourceRange.aspectMask = texture->m_AttachmentImageAspectMask; ENSURE_VK_SUCCESS(vkCreateImageView( device->GetVkDevice(), &imageViewCreateInfo, nullptr, &texture->m_AttachmentImageView)); } if (usage & Usage::SAMPLED) { imageViewCreateInfo.subresourceRange.aspectMask = texture->m_SamplerImageAspectMask; ENSURE_VK_SUCCESS(vkCreateImageView( device->GetVkDevice(), &imageViewCreateInfo, nullptr, &texture->m_SamplerImageView)); texture->m_Sampler = device->GetSamplerManager().GetOrCreateSampler( defaultSamplerDesc); 
texture->m_IsCompareEnabled = defaultSamplerDesc.compareEnabled; } device->SetObjectName(VK_OBJECT_TYPE_IMAGE, texture->m_Image, name); if (texture->m_AttachmentImageView != VK_NULL_HANDLE) device->SetObjectName(VK_OBJECT_TYPE_IMAGE_VIEW, texture->m_AttachmentImageView, name); if (texture->m_SamplerImageView != VK_NULL_HANDLE) device->SetObjectName(VK_OBJECT_TYPE_IMAGE_VIEW, texture->m_SamplerImageView, name); return texture; } // static std::unique_ptr CTexture::WrapBackbufferImage( CDevice* device, const char* name, const VkImage image, const VkFormat format, const VkImageUsageFlags usage, const uint32_t width, const uint32_t height) { std::unique_ptr texture(new CTexture(device)); if (format == VK_FORMAT_R8G8B8A8_UNORM) texture->m_Format = Format::R8G8B8A8_UNORM; else if (format == VK_FORMAT_B8G8R8A8_UNORM) texture->m_Format = Format::B8G8R8A8_UNORM; else texture->m_Format = Format::UNDEFINED; texture->m_Type = Type::TEXTURE_2D; if (usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) texture->m_Usage |= Usage::COLOR_ATTACHMENT; if (usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT) texture->m_Usage |= Usage::TRANSFER_SRC; if (usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT) texture->m_Usage |= Usage::TRANSFER_DST; texture->m_Width = width; texture->m_Height = height; texture->m_MIPLevelCount = 1; texture->m_SampleCount = 1; texture->m_LayerCount = 1; texture->m_VkFormat = format; // The image is owned by its swapchain, but we don't set a special flag // because the ownership is detected by m_Allocation presence. 
texture->m_Image = image; texture->m_AttachmentImageAspectMask = VK_IMAGE_ASPECT_COLOR_BIT; texture->m_SamplerImageAspectMask = VK_IMAGE_ASPECT_COLOR_BIT; VkImageViewCreateInfo imageViewCreateInfo{}; imageViewCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; imageViewCreateInfo.image = image; imageViewCreateInfo.viewType = VK_IMAGE_VIEW_TYPE_2D; imageViewCreateInfo.format = format; imageViewCreateInfo.components.r = VK_COMPONENT_SWIZZLE_IDENTITY; imageViewCreateInfo.components.g = VK_COMPONENT_SWIZZLE_IDENTITY; imageViewCreateInfo.components.b = VK_COMPONENT_SWIZZLE_IDENTITY; imageViewCreateInfo.components.a = VK_COMPONENT_SWIZZLE_IDENTITY; imageViewCreateInfo.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; imageViewCreateInfo.subresourceRange.baseMipLevel = 0; imageViewCreateInfo.subresourceRange.levelCount = 1; imageViewCreateInfo.subresourceRange.baseArrayLayer = 0; imageViewCreateInfo.subresourceRange.layerCount = 1; ENSURE_VK_SUCCESS(vkCreateImageView( device->GetVkDevice(), &imageViewCreateInfo, nullptr, &texture->m_AttachmentImageView)); device->SetObjectName(VK_OBJECT_TYPE_IMAGE_VIEW, texture->m_AttachmentImageView, name); return texture; } // static std::unique_ptr CTexture::CreateReadback( CDevice* device, const char* name, const Format format, const uint32_t width, const uint32_t height) { std::unique_ptr texture(new CTexture(device)); texture->m_Format = format; texture->m_Type = Type::TEXTURE_2D; texture->m_Usage = Usage::TRANSFER_DST; texture->m_Width = width; texture->m_Height = height; texture->m_MIPLevelCount = 1; texture->m_SampleCount = 1; texture->m_LayerCount = 1; texture->m_VkFormat = Mapping::FromFormat(texture->m_Format); texture->m_AttachmentImageAspectMask = VK_IMAGE_ASPECT_COLOR_BIT; texture->m_SamplerImageAspectMask = VK_IMAGE_ASPECT_COLOR_BIT; VkImageCreateInfo imageCreateInfo{}; imageCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO; imageCreateInfo.imageType = VK_IMAGE_TYPE_2D; imageCreateInfo.extent.width = 
width; imageCreateInfo.extent.height = height; imageCreateInfo.extent.depth = 1; imageCreateInfo.mipLevels = 1; imageCreateInfo.arrayLayers = 1; imageCreateInfo.format = texture->m_VkFormat; imageCreateInfo.samples = Mapping::FromSampleCount(1); imageCreateInfo.tiling = VK_IMAGE_TILING_LINEAR; imageCreateInfo.usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT; imageCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; imageCreateInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; VmaAllocationCreateInfo allocationCreateInfo{}; allocationCreateInfo.flags |= VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT; #ifndef NDEBUG allocationCreateInfo.flags |= VMA_ALLOCATION_CREATE_USER_DATA_COPY_STRING_BIT; allocationCreateInfo.pUserData = const_cast(name); #endif allocationCreateInfo.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; allocationCreateInfo.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST; const VkResult createImageResult = vmaCreateImage( device->GetVMAAllocator(), &imageCreateInfo, &allocationCreateInfo, &texture->m_Image, &texture->m_Allocation, &texture->m_AllocationInfo); if (createImageResult != VK_SUCCESS) { LOGERROR("Failed to create VkImage: %d (%s)", static_cast(createImageResult), Utilities::GetVkResultName(createImageResult)); return nullptr; } if (!texture->m_AllocationInfo.pMappedData) { LOGERROR("Failed to map readback image."); return nullptr; } device->SetObjectName(VK_OBJECT_TYPE_IMAGE, texture->m_Image, name); return texture; } CTexture::CTexture(CDevice* device) : m_Device(device), m_UID(device->GenerateNextDeviceObjectUID()) { } CTexture::~CTexture() { if (m_AttachmentImageView != VK_NULL_HANDLE) m_Device->ScheduleObjectToDestroy( VK_OBJECT_TYPE_IMAGE_VIEW, m_AttachmentImageView, VK_NULL_HANDLE); if (m_SamplerImageView != VK_NULL_HANDLE) m_Device->ScheduleObjectToDestroy( VK_OBJECT_TYPE_IMAGE_VIEW, m_SamplerImageView, VK_NULL_HANDLE); if (m_Allocation != VK_NULL_HANDLE) 
m_Device->ScheduleObjectToDestroy( VK_OBJECT_TYPE_IMAGE, m_Image, m_Allocation); m_Device->ScheduleTextureToDestroy(m_UID); } IDevice* CTexture::GetDevice() { return m_Device; } } // namespace Vulkan } // namespace Backend } // namespace Renderer