Skip to content

Commit

Permalink
Implement async shader compilation plus caching for GL ES 3
Browse files Browse the repository at this point in the history
Async. compilation via ubershader is currently available in the scene and particles shaders only.

Bonus:
- Use `#if defined()` syntax for not true conditionals, so they don't unnecessarily take a bit in the version flagset.
- Remove unused `ENABLE_CLIP_ALPHA` from scene shader.
- Remove unused `PARTICLES_COPY` from the particles shader.
- Remove unused uniform related code.
- Shader language/compiler: use ordered hash maps for deterministic code generation (needed for caching).
  • Loading branch information
RandomShaper committed Nov 9, 2021
1 parent b6f04df commit 4c71078
Show file tree
Hide file tree
Showing 43 changed files with 2,161 additions and 661 deletions.
133 changes: 133 additions & 0 deletions core/threaded_callable_queue.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
/*************************************************************************/
/* threaded_callable_queue.h */
/*************************************************************************/
/* This file is part of: */
/* GODOT ENGINE */
/* https://godotengine.org */
/*************************************************************************/
/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */
/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). */
/* */
/* Permission is hereby granted, free of charge, to any person obtaining */
/* a copy of this software and associated documentation files (the */
/* "Software"), to deal in the Software without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of the Software, and to */
/* permit persons to whom the Software is furnished to do so, subject to */
/* the following conditions: */
/* */
/* The above copyright notice and this permission notice shall be */
/* included in all copies or substantial portions of the Software. */
/* */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
/*************************************************************************/

#ifndef THREADED_CALLABLE_QUEUE_H
#define THREADED_CALLABLE_QUEUE_H

#include "core/local_vector.h"
#include "core/ordered_hash_map.h"
#include "core/os/mutex.h"
#include "core/os/semaphore.h"
#include "core/os/thread.h"

#include <functional>

// A queue of keyed jobs that are executed one at a time on a dedicated worker
// thread owned by the queue.
//
// K is the key type used to identify a queued job so that it can later be
// cancelled; a key can only be queued once at a time (see enqueue()).
// The worker thread is started by the constructor and joined by the destructor.
template <class K>
class ThreadedCallableQueue {
public:
// A unit of work: any callable taking no arguments and returning nothing.
using Job = std::function<void()>;

private:
// Shutdown flag. Set by the destructor; checked by the worker thread,
// enqueue() and cancel() while they hold `mutex`.
bool exit;
// The worker thread; runs _thread_func() with `this` as its user data.
Thread thread;
// Locked by the worker thread, enqueue() and cancel() around their accesses
// to `queue` and `exit`.
BinaryMutex mutex;
// Posted once per enqueued job, and once more by the destructor to wake the
// worker up for shutdown.
Semaphore sem;
// Jobs waiting to be run, indexed by key. The worker always takes the front element.
OrderedHashMap<K, Job> queue;

// Entry point of the worker thread; p_user_data is the owning ThreadedCallableQueue.
static void _thread_func(void *p_user_data);

public:
// Queues p_job under p_key and signals the worker thread.
void enqueue(K p_key, Job p_job);
// Removes the job queued under p_key, if it is still waiting in the queue.
void cancel(K p_key);

// Starts the worker thread.
ThreadedCallableQueue();
// Requests the worker thread to exit and waits for it to finish.
~ThreadedCallableQueue();
};

// Worker thread body.
//
// Sleeps on the semaphore; on each wake-up it either leaves the loop (when the
// exit flag is set) or pops the front job from the queue and runs it with the
// mutex released. Once the loop is left, whatever is still queued is run, in
// queue order, with the mutex held.
//
// p_user_data: the ThreadedCallableQueue instance that owns this thread.
template <class K>
void ThreadedCallableQueue<K>::_thread_func(void *p_user_data) {
	ThreadedCallableQueue *tcq = static_cast<ThreadedCallableQueue *>(p_user_data);

	for (;;) {
		tcq->sem.wait();
		tcq->mutex.lock();

		if (tcq->exit) {
			tcq->mutex.unlock();
			break;
		}

		typename OrderedHashMap<K, Job>::Element head = tcq->queue.front();
		if (!head) {
			// Defensive path: a wake-up with nothing queued means the semaphore
			// was posted more times than there are jobs.
			ERR_PRINT("Semaphore unlocked, the queue is empty. Bug?");
			tcq->mutex.unlock();
			continue;
		}

		// Copy the job out and drop it from the queue before unlocking, so the
		// job itself runs without the mutex held.
		Job pending = head.value();
		tcq->queue.erase(head);
		tcq->mutex.unlock();

		pending();
	}

	// Shutdown: run every job that is still queued.
	tcq->mutex.lock();
	typename OrderedHashMap<K, Job>::Element remaining = tcq->queue.front();
	while (remaining) {
		Job job = remaining.value();
		job();
		remaining = remaining.next();
	}
	tcq->mutex.unlock();
}

// Queues p_job under p_key and posts the semaphore so the worker thread wakes
// up to process it.
//
// p_key: identifier for the job; it must not already be present in the queue.
// p_job: the callable to run on the worker thread.
//
// NOTE(review): ERR_FAIL_COND is presumably the engine's "report and return"
// macro, so nothing is queued if the queue is shutting down or the key is
// already in use -- confirm against the macro definition.
template <class K>
void ThreadedCallableQueue<K>::enqueue(K p_key, Job p_job) {
// Scoped lock on the queue mutex.
MutexLock lock(mutex);
ERR_FAIL_COND(exit);
ERR_FAIL_COND(queue.has(p_key));
queue.insert(p_key, p_job);
// One post per queued job: the worker waits on the semaphore once per job it takes.
sem.post();
}

// Removes the job queued under p_key, if it has not been picked up by the
// worker thread yet, and takes back the semaphore post that was made for it.
//
// p_key: identifier the job was enqueued with.
//
// The semaphore is decremented with a non-blocking try_wait() on purpose:
// the worker thread may already have consumed the post for this job (it has
// returned from sem.wait() but is still waiting for the mutex held here).
// A blocking sem.wait() at this point would never return, and since the mutex
// is held while waiting, the worker could never proceed either (deadlock).
// In that situation the worker simply wakes up, finds one job less than
// expected and goes through its empty-queue path.
template <class K>
void ThreadedCallableQueue<K>::cancel(K p_key) {
	MutexLock lock(mutex);
	ERR_FAIL_COND(exit);
	if (queue.erase(p_key)) {
		// Never block while holding the mutex; see the function comment.
		sem.try_wait();
	}
}

// Creates an empty queue and starts its worker thread.
template <class K>
ThreadedCallableQueue<K>::ThreadedCallableQueue() :
exit(false) {
// The worker receives `this` as its user data; the remaining members are
// already constructed by the time the constructor body runs.
thread.start(&_thread_func, this);
}

// Asks the worker thread to exit and blocks until it has finished.
//
// The exit flag is read by the worker thread, enqueue() and cancel() while
// holding the mutex, so it is also written under the mutex here; writing it
// unsynchronized would be a data race with those readers.
template <class K>
ThreadedCallableQueue<K>::~ThreadedCallableQueue() {
	{
		MutexLock lock(mutex);
		exit = true;
	}
	// The mutex must be released before joining: the worker needs to lock it
	// in order to observe the flag.
	sem.post();
	thread.wait_to_finish();
}

#endif // THREADED_CALLABLE_QUEUE_H
33 changes: 33 additions & 0 deletions doc/classes/ProjectSettings.xml
Original file line number Diff line number Diff line change
Expand Up @@ -1222,6 +1222,39 @@
If [code]true[/code] and available on the target Android device, enables high floating point precision for all shader computations in GLES2.
[b]Warning:[/b] High floating point precision can be extremely slow on older devices and is often not available at all. Use with caution.
</member>
<member name="rendering/gles3/shaders/log_active_async_compiles_count" type="bool" setter="" getter="" default="false">
If [code]true[/code], every time an asynchronous shader compilation or an asynchronous shader reconstruction from cache starts or finishes, a line will be logged telling how many of those are happening.
If the platform doesn't support parallel shader compile, but only the compile queue via a secondary GL context, what the message will tell is the number of shader compiles currently queued.
[b]Note:[/b] This setting is only meaningful if [code]rendering/gles3/shaders/shader_compilation_mode[/code] is [b]not[/b] [code]Synchronous[/code].
</member>
<member name="rendering/gles3/shaders/max_simultaneous_compiles" type="int" setter="" getter="" default="2">
This is the maximum number of shaders that can be compiled (or reconstructed from cache) at the same time.
At runtime, while that count is reached, other shaders that can be asynchronously compiled will just use their fallback, without their setup being started until the count gets lower.
This is a way to balance the CPU work between running the game and compiling the shaders. The goal is to have as many asynchronous compiles in flight as possible without impacting the responsiveness of the game, which beyond some point would destroy the benefits of asynchronous compilation. In other words, you may be able to afford that the FPS lowers a bit, and that will already be better than the stalling that synchronous compilation could cause.
The default value is a conservative one, so you are advised to tweak it according to the hardware you are targeting.
[b]Note:[/b] This setting is only meaningful if [code]rendering/gles3/shaders/shader_compilation_mode[/code] is [b]not[/b] [code]Synchronous[/code].
</member>
<member name="rendering/gles3/shaders/max_simultaneous_compiles.mobile" type="int" setter="" getter="" default="1">
The default is a very conservative override for [code]rendering/gles3/shaders/max_simultaneous_compiles[/code].
Depending on the specific devices you are targeting, you may want to raise it.
[b]Note:[/b] This setting is only meaningful if [code]rendering/gles3/shaders/shader_compilation_mode[/code] is [b]not[/b] [code]Synchronous[/code].
</member>
<member name="rendering/gles3/shaders/shader_cache_size_mb" type="int" setter="" getter="" default="512">
The maximum size, in megabytes, that the ubershader cache can grow up to. On startup, the least recently used entries will be deleted until the total size is within bounds.
[b]Note:[/b] This setting is only meaningful if [code]rendering/gles3/shaders/shader_compilation_mode[/code] is set to [code]Asynchronous + Cache[/code].
</member>
<member name="rendering/gles3/shaders/shader_cache_size_mb.mobile" type="int" setter="" getter="" default="128">
An override for [code]rendering/gles3/shaders/shader_cache_size_mb[/code], so a smaller maximum size can be configured for mobile platforms, where storage space is more limited.
[b]Note:[/b] This setting is only meaningful if [code]rendering/gles3/shaders/shader_compilation_mode[/code] is set to [code]Asynchronous + Cache[/code].
</member>
<member name="rendering/gles3/shaders/shader_compilation_mode" type="int" setter="" getter="" default="0">
If set to [code]Asynchronous[/code] and available on the target device, asynchronous compilation of shaders is enabled (in contrast to [code]Synchronous[/code]).
That means that when a shader is first used under some new rendering situation, the game won't stall while such shader is being compiled. Instead, a fallback will be used and the real shader will be compiled in the background. Once the actual shader is compiled, it will be used the next times it's used to draw a frame.
Depending on the async mode configured for a given material/shader, the fallback will be an "ubershader" (the default) or just skip rendering any item it is applied to.
An ubershader is a very complex shader, slow but suited to any rendering situation, that the engine generates internally so it can be used from the beginning while the traditional conditioned, optimized version of it is being compiled.
In order to save some loading time, you can use [code]Asynchronous + Cache[/code], which also causes the ubershaders to be cached into storage so they can be ready faster next time they are used (provided the platform provides support for it).
[b]Warning:[/b] Async. compilation is currently only supported for spatial and particle materials/shaders.
</member>
<member name="rendering/limits/buffers/blend_shape_max_buffer_size_kb" type="int" setter="" getter="" default="4096">
Max buffer size for blend shapes. Any blend shape bigger than this will not work.
</member>
Expand Down
11 changes: 11 additions & 0 deletions doc/classes/SpatialMaterial.xml
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,10 @@
<member name="ao_texture_channel" type="int" setter="set_ao_texture_channel" getter="get_ao_texture_channel" enum="SpatialMaterial.TextureChannel">
Specifies the channel of the [member ao_texture] in which the ambient occlusion information is stored. This is useful when you store the information for multiple effects in a single texture. For example if you stored metallic in the red channel, roughness in the blue, and ambient occlusion in the green you could reduce the number of textures you use.
</member>
<member name="async_mode" type="int" setter="set_async_mode" getter="get_async_mode" enum="SpatialMaterial.AsyncMode" default="0">
If [member ProjectSettings.rendering/gles3/shaders/shader_compilation_mode] is [code]Asynchronous[/code] (with or without cache), this determines how this material must behave in regards to asynchronous shader compilation.
[constant ASYNC_MODE_VISIBLE] is the default and the best for most cases.
</member>
<member name="clearcoat" type="float" setter="set_clearcoat" getter="get_clearcoat">
Sets the strength of the clearcoat effect. Setting to [code]0[/code] looks the same as disabling the clearcoat effect.
</member>
Expand Down Expand Up @@ -639,5 +643,12 @@
<constant name="DISTANCE_FADE_OBJECT_DITHER" value="3" enum="DistanceFadeMode">
Smoothly fades the object out based on the object's distance from the camera using a dither approach. Dithering discards pixels based on a set pattern to smoothly fade without enabling transparency. On certain hardware this can be faster than [constant DISTANCE_FADE_PIXEL_ALPHA].
</constant>
<constant name="ASYNC_MODE_VISIBLE" value="0" enum="AsyncMode">
The real conditioned shader needed on each situation will be sent for background compilation. In the meantime, a very complex shader that adapts to every situation will be used ("ubershader"). This ubershader is much slower to render, but will keep the game running without stalling to compile. Once shader compilation is done, the ubershader is replaced by the traditional optimized shader.
</constant>
<constant name="ASYNC_MODE_HIDDEN" value="1" enum="AsyncMode">
Anything with this material applied won't be rendered while this material's shader is being compiled.
This is useful for optimization, in cases where the visuals won't suffer from having certain non-essential elements missing during the short time their shaders are being compiled.
</constant>
</constants>
</class>
9 changes: 9 additions & 0 deletions doc/classes/VisualServer.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2525,6 +2525,15 @@
Sets the default clear color which is used when a specific clear color has not been selected.
</description>
</method>
<method name="set_shader_async_hidden_forbidden">
<return type="void" />
<argument index="0" name="forbidden" type="bool" />
<description>
If asynchronous shader compilation is enabled, this controls whether [constant SpatialMaterial.ASYNC_MODE_HIDDEN] is obeyed.
For instance, you may want to enable this temporarily before taking a screenshot. This ensures everything is visible even if shaders with async mode [i]hidden[/i] are not ready yet.
Reflection probes use this internally to ensure they capture everything regardless of whether the shaders are ready or not.
</description>
</method>
<method name="set_shader_time_scale">
<return type="void" />
<argument index="0" name="scale" type="float" />
Expand Down
3 changes: 3 additions & 0 deletions drivers/dummy/rasterizer_dummy.h
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,9 @@ class RasterizerStorageDummy : public RasterizerStorage {
void shader_get_custom_defines(RID p_shader, Vector<String> *p_defines) const {}
void shader_remove_custom_define(RID p_shader, const String &p_define) {}

void set_shader_async_hidden_forbidden(bool p_forbidden) {}
bool is_shader_async_hidden_forbidden() { return false; }

/* COMMON MATERIAL API */

RID material_create() { return RID(); }
Expand Down
3 changes: 3 additions & 0 deletions drivers/gles2/rasterizer_storage_gles2.h
Original file line number Diff line number Diff line change
Expand Up @@ -544,6 +544,9 @@ class RasterizerStorageGLES2 : public RasterizerStorage {
virtual void shader_get_custom_defines(RID p_shader, Vector<String> *p_defines) const;
virtual void shader_remove_custom_define(RID p_shader, const String &p_define);

void set_shader_async_hidden_forbidden(bool p_forbidden) {}
bool is_shader_async_hidden_forbidden() { return false; }

void _update_shader(Shader *p_shader) const;
void update_dirty_shaders();

Expand Down
42 changes: 21 additions & 21 deletions drivers/gles2/shader_compiler_gles2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -295,8 +295,8 @@ String ShaderCompilerGLES2::_dump_node_code(const SL::Node *p_node, int p_level,
int max_texture_uniforms = 0;
int max_uniforms = 0;

for (Map<StringName, SL::ShaderNode::Uniform>::Element *E = snode->uniforms.front(); E; E = E->next()) {
if (SL::is_sampler_type(E->get().type)) {
for (OrderedHashMap<StringName, SL::ShaderNode::Uniform>::Element E = snode->uniforms.front(); E; E = E.next()) {
if (SL::is_sampler_type(E.get().type)) {
max_texture_uniforms++;
} else {
max_uniforms++;
Expand Down Expand Up @@ -347,55 +347,55 @@ String ShaderCompilerGLES2::_dump_node_code(const SL::Node *p_node, int p_level,

// uniforms

for (Map<StringName, SL::ShaderNode::Uniform>::Element *E = snode->uniforms.front(); E; E = E->next()) {
for (OrderedHashMap<StringName, SL::ShaderNode::Uniform>::Element E = snode->uniforms.front(); E; E = E.next()) {
StringBuffer<> uniform_code;

// use highp if no precision is specified to prevent different default values in fragment and vertex shader
SL::DataPrecision precision = E->get().precision;
if (precision == SL::PRECISION_DEFAULT && E->get().type != SL::TYPE_BOOL) {
SL::DataPrecision precision = E.get().precision;
if (precision == SL::PRECISION_DEFAULT && E.get().type != SL::TYPE_BOOL) {
precision = SL::PRECISION_HIGHP;
}

uniform_code += "uniform ";
uniform_code += _prestr(precision);
uniform_code += _typestr(E->get().type);
uniform_code += _typestr(E.get().type);
uniform_code += " ";
uniform_code += _mkid(E->key());
uniform_code += _mkid(E.key());
uniform_code += ";\n";

if (SL::is_sampler_type(E->get().type)) {
r_gen_code.texture_uniforms.write[E->get().texture_order] = E->key();
r_gen_code.texture_hints.write[E->get().texture_order] = E->get().hint;
if (SL::is_sampler_type(E.get().type)) {
r_gen_code.texture_uniforms.write[E.get().texture_order] = E.key();
r_gen_code.texture_hints.write[E.get().texture_order] = E.get().hint;
} else {
r_gen_code.uniforms.write[E->get().order] = E->key();
r_gen_code.uniforms.write[E.get().order] = E.key();
}

vertex_global += uniform_code.as_string();
fragment_global += uniform_code.as_string();

p_actions.uniforms->insert(E->key(), E->get());
p_actions.uniforms->insert(E.key(), E.get());
}

// varyings

List<Pair<StringName, SL::ShaderNode::Varying>> var_frag_to_light;

for (Map<StringName, SL::ShaderNode::Varying>::Element *E = snode->varyings.front(); E; E = E->next()) {
if (E->get().stage == SL::ShaderNode::Varying::STAGE_FRAGMENT_TO_LIGHT || E->get().stage == SL::ShaderNode::Varying::STAGE_FRAGMENT) {
var_frag_to_light.push_back(Pair<StringName, SL::ShaderNode::Varying>(E->key(), E->get()));
fragment_varyings.insert(E->key());
for (OrderedHashMap<StringName, SL::ShaderNode::Varying>::Element E = snode->varyings.front(); E; E = E.next()) {
if (E.get().stage == SL::ShaderNode::Varying::STAGE_FRAGMENT_TO_LIGHT || E.get().stage == SL::ShaderNode::Varying::STAGE_FRAGMENT) {
var_frag_to_light.push_back(Pair<StringName, SL::ShaderNode::Varying>(E.key(), E.get()));
fragment_varyings.insert(E.key());
continue;
}
StringBuffer<> varying_code;

varying_code += "varying ";
varying_code += _prestr(E->get().precision);
varying_code += _typestr(E->get().type);
varying_code += _prestr(E.get().precision);
varying_code += _typestr(E.get().type);
varying_code += " ";
varying_code += _mkid(E->key());
if (E->get().array_size > 0) {
varying_code += _mkid(E.key());
if (E.get().array_size > 0) {
varying_code += "[";
varying_code += itos(E->get().array_size);
varying_code += itos(E.get().array_size);
varying_code += "]";
}
varying_code += ";\n";
Expand Down
7 changes: 7 additions & 0 deletions drivers/gles3/rasterizer_gles3.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -207,13 +207,16 @@ void RasterizerGLES3::begin_frame(double frame_step) {
storage->frame.time[2] = Math::fmod(time_total, 900);
storage->frame.time[3] = Math::fmod(time_total, 60);
storage->frame.count++;
storage->frame.shader_compiles_started = 0;
storage->frame.delta = frame_step;

storage->update_dirty_resources();

storage->info.render_final = storage->info.render;
storage->info.render.reset();

ShaderGLES3::current_frame = storage->frame.count;

scene->iteration();
}

Expand Down Expand Up @@ -410,6 +413,8 @@ void RasterizerGLES3::end_frame(bool p_swap_buffers) {
}
}

ShaderGLES3::advance_async_shaders_compilation();

if (p_swap_buffers) {
OS::get_singleton()->swap_buffers();
} else {
Expand Down Expand Up @@ -487,6 +492,8 @@ RasterizerGLES3::RasterizerGLES3() {

time_total = 0;
time_scale = 1;

ShaderGLES3::compiles_started_this_frame = &storage->frame.shader_compiles_started;
}

RasterizerGLES3::~RasterizerGLES3() {
Expand Down
Loading

0 comments on commit 4c71078

Please sign in to comment.