Meshlet Occlusion Culling #78

Merged: 23 commits, Oct 25, 2024

Commits
a223dfc
Start adding meshlet-visibility/HZB-culling
crocdialer Oct 18, 2024
459e3e8
meshlet visibility-bits in place, passthrough here
crocdialer Oct 18, 2024
b03ccd1
need to account for lods, reserve space, adjust base_indices in gpu_cull
crocdialer Oct 18, 2024
ba909da
small excursion, use only two components of normal-map (required for BC5)
crocdialer Oct 18, 2024
b6f7d0d
rewrite div/modulo with shifts
crocdialer Oct 19, 2024
96fc381
some rewrite in Rasterizer, create descriptor-sets after delegate
crocdialer Oct 19, 2024
ca004a3
some follow-up rework in PBRDeferred, less copying
crocdialer Oct 19, 2024
1c52178
almost there
crocdialer Oct 19, 2024
fe9b01a
tadaa, meshlet occlusion-culling working here
crocdialer Oct 19, 2024
a7c089d
set meshlet-visibility bits atomically
crocdialer Oct 20, 2024
9fe7784
move all meshlet-culling into post-pass
crocdialer Oct 20, 2024
8fd1f3e
noodling with last mile, wip here with some visibility-glitches
crocdialer Oct 20, 2024
10f4db0
meshlet contribution-culling, minor restructuring in shader
crocdialer Oct 20, 2024
d56ac36
some minor changes in shaders, still flickering-issue with meshlet-oc…
crocdialer Oct 21, 2024
d15b26d
workgroup-barrier ftw, now only traces of failed occlusion-culling left
crocdialer Oct 22, 2024
f3253b5
minor changes
crocdialer Oct 22, 2024
04228f9
minor corrections for non-culling meshlet count
crocdialer Oct 22, 2024
9152e24
meshlet-visibility working now
crocdialer Oct 23, 2024
5f44350
swap ballot with atomic
crocdialer Oct 23, 2024
8b22c3c
turn culling-preprocessor flag into constant_id
crocdialer Oct 23, 2024
be0b38a
small hashmap excursion, cough up non-atomic flavour
crocdialer Oct 24, 2024
74e854b
meshlets: fix annoying startup flicker-issue by pre-creating HZB
crocdialer Oct 24, 2024
ba2ba9b
fixed last missing bit, late_visibility unfucks the interaction with o…
crocdialer Oct 25, 2024
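The commits "rewrite div/modulo with shifts" (b6f7d0d) and "set meshlet-visibility bits atomically" (a7c089d) both concern the per-meshlet visibility bitfield this PR introduces. The actual implementation lives in the culling shaders, which are not part of the headers below; the following standalone C++ sketch only illustrates the indexing scheme under that assumption: one bit per meshlet, packed into 32-bit words, division/modulo by 32 expressed as shift/mask, and the write done via an atomic OR so concurrent updates cannot be lost. All names are illustrative, not taken from the PR.

#include <atomic>
#include <cstdint>

// illustrative sketch: one visibility bit per meshlet, packed into 32-bit words
constexpr uint32_t word_index(uint32_t meshlet_index) { return meshlet_index >> 5; }// meshlet_index / 32
constexpr uint32_t bit_mask(uint32_t meshlet_index) { return 1u << (meshlet_index & 31u); }// meshlet_index % 32

// atomic OR mirrors a shader-side atomicOr on the visibility buffer, so no update is lost
inline void set_visible(std::atomic<uint32_t> *bits, uint32_t meshlet_index)
{
    bits[word_index(meshlet_index)].fetch_or(bit_mask(meshlet_index), std::memory_order_relaxed);
}

inline bool was_visible(const std::atomic<uint32_t> *bits, uint32_t meshlet_index)
{
    return (bits[word_index(meshlet_index)].load(std::memory_order_relaxed) & bit_mask(meshlet_index)) != 0;
}

Commit ba909da ("use only two components of normal-map, required for BC5") relies on reconstructing the third component from the unit-length constraint. Again a hedged CPU-side sketch, not the PR's shader code:

#include <algorithm>
#include <cmath>

struct normal_t { float x, y, z; };

// reconstruct a tangent-space normal from two stored channels (BC5); x, y assumed in [-1, 1]
inline normal_t reconstruct_normal(float x, float y)
{
    float z = std::sqrt(std::max(0.f, 1.f - x * x - y * y));
    return {x, y, z};
}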
19 changes: 13 additions & 6 deletions include/vierkant/Rasterizer.hpp
@@ -24,13 +24,12 @@ using double_millisecond_t = std::chrono::duration<double, std::milli>;
/**
* @brief Rasterizer can be used to run arbitrary rasterization/graphics pipelines.
*
- * It will not render anything on its own, only record secondary command-buffers,
- * meant to be executed within an existing renderpass.
+ * It will not submit anything on its own, only record drawing commands into command-buffers.
*
* Required resources like descriptor-sets and uniform-buffers will be created
* and kept alive, depending on the requested number of in-flight (pending) frames.
*
* Renderer is NOT thread-safe, with the exception of stage_drawables(...).
*/
class Rasterizer
{
@@ -53,6 +52,8 @@ class Rasterizer
BINDING_MESHLETS = 13,
BINDING_MESHLET_VERTICES = 14,
BINDING_MESHLET_TRIANGLES = 15,
BINDING_MESHLET_VISIBILITY = 16,
BINDING_DEPTH_PYRAMID = 17,
BINDING_MAX_RANGE
};

@@ -76,16 +77,18 @@
vierkant::Mesh::lod_t lods[8];
};

-struct indexed_indirect_command_t
+struct alignas(16) indexed_indirect_command_t
{
VkDrawIndexedIndirectCommand vk_draw = {};// size: 5

VkDrawMeshTasksIndirectCommandEXT vk_mesh_draw = {};// size: 3

uint32_t visible = false;
uint32_t late_visible = false;
uint32_t object_index = 0;
uint32_t base_meshlet = 0;
uint32_t num_meshlets = 0;
uint32_t meshlet_visibility_index = 0;
uint32_t count_buffer_offset = 0;
uint32_t first_draw_index = 0;
};
@@ -104,6 +107,9 @@ class Rasterizer
//! device array containing any array of material_t
vierkant::BufferPtr materials;

//! device array containing a visibility bitfield for all meshlets
vierkant::BufferPtr meshlet_visibilities;

//! host-visible array of indexed_indirect_command_t
vierkant::BufferPtr draws_in;

@@ -271,6 +277,7 @@ class Rasterizer
vierkant::BufferPtr mesh_draw_buffer;
vierkant::BufferPtr mesh_entry_buffer;
vierkant::BufferPtr material_buffer;
vierkant::BufferPtr meshlet_visibility_buffer;

// host visible keep-alive staging-buffer
vierkant::BufferPtr staging_buffer;
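The new meshlet_visibilities buffer and the per-draw meshlet_visibility_index / num_meshlets fields above imply a host-side step that hands every draw its own slice of a shared bitfield. A minimal sketch of how such an assignment could look, assuming meshlet_visibility_index is a bit-offset into that bitfield (draw_info_t is a hypothetical stand-in, not the PR's indexed_indirect_command_t):

#include <cstdint>
#include <vector>

// hypothetical stand-in for the per-draw data
struct draw_info_t
{
    uint32_t num_meshlets = 0;
    uint32_t meshlet_visibility_index = 0;// first bit of this draw's slice in the bitfield
};

// assign each draw a contiguous bit-range and return the number of 32-bit words
// a visibility buffer would need to cover all meshlets
inline uint32_t assign_visibility_slices(std::vector<draw_info_t> &draws)
{
    uint32_t num_bits = 0;
    for(auto &draw: draws)
    {
        draw.meshlet_visibility_index = num_bits;
        num_bits += draw.num_meshlets;
    }
    return (num_bits + 31u) >> 5;// round up to whole uint32 words
}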
12 changes: 6 additions & 6 deletions include/vierkant/descriptor.hpp
@@ -87,15 +87,15 @@ DescriptorSetLayoutPtr create_descriptor_set_layout(const vierkant::DevicePtr &d
const descriptor_map_t &descriptors);

/**
- * @brief Create a shared VkDescriptorSet (DescriptorSetPtr) for a provided DescriptorLayout
+ * @brief Create a shared VkDescriptorSet (DescriptorSetPtr) for a provided set-layout.
*
- * @param device handle for the vierkant::Device to create the DescriptorSet
- * @param pool handle for a shared VkDescriptorPool to allocate the DescriptorSet from
- * @param layout handle for a shared VkDescriptorSetLayout to use as blueprint
+ * @param device handle for the vierkant::Device to create the DescriptorSet
+ * @param pool handle for a shared VkDescriptorPool to allocate the DescriptorSet from
+ * @param set_layout handle for a VkDescriptorSetLayout
* @return the newly created DescriptorSetPtr
*/
DescriptorSetPtr create_descriptor_set(const vierkant::DevicePtr &device, const DescriptorPoolPtr &pool,
-const DescriptorSetLayoutPtr &layout, bool variable_count);
+VkDescriptorSetLayout set_layout, bool variable_count);

/**
* @brief Update an existing shared VkDescriptorSet with a provided array of vierkant::descriptor_t.
@@ -146,7 +146,7 @@ DescriptorSetLayoutPtr find_or_create_set_layout(const vierkant::DevicePtr &devi
* @return a retrieved or newly created, shared VkDescriptorSet.
*/
DescriptorSetPtr find_or_create_descriptor_set(const vierkant::DevicePtr &device,
-const DescriptorSetLayoutPtr &set_layout,
+VkDescriptorSetLayout set_layout,
const descriptor_map_t &descriptors,
const vierkant::DescriptorPoolPtr &pool, descriptor_set_map_t &last,
descriptor_set_map_t &current, bool variable_count,
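The signature change above swaps the shared DescriptorSetLayoutPtr parameter for a raw VkDescriptorSetLayout. A hedged usage sketch, assuming DescriptorSetLayoutPtr is a shared handle whose get() yields the underlying VkDescriptorSetLayout, and that device, pool and a populated descriptor_map_t already exist:

#include <vierkant/descriptor.hpp>

vierkant::DescriptorSetPtr allocate_set(const vierkant::DevicePtr &device,
                                        const vierkant::DescriptorPoolPtr &pool,
                                        const vierkant::descriptor_map_t &descriptors)
{
    // shared set-layout created from the descriptor-map (existing API in this header)
    auto layout = vierkant::create_descriptor_set_layout(device, descriptors);

    // the reworked overload now takes the plain Vulkan handle instead of the shared pointer
    return vierkant::create_descriptor_set(device, pool, layout.get(), false /*variable_count*/);
}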
17 changes: 15 additions & 2 deletions include/vierkant/gpu_culling.hpp
@@ -35,6 +35,8 @@ struct gpu_cull_params_t
//! limit number of LoDs (0: no limit)
uint32_t max_num_lods = 0;

bool skip_meshlets = false;

VkQueue queue = VK_NULL_HANDLE;
vierkant::semaphore_submit_info_t semaphore_submit_info = {};

@@ -77,16 +79,27 @@ struct create_depth_pyramid_params_t
* @brief create_gpu_cull_context is a factory to create an opaque gpu_cull_context_ptr.
*
* @param device a provided vierkant::Device.
* @param size context framebuffer-size
* @param pipeline_cache an optional pipeline_cache.
* @return an opaque pointer, owning a gpu_cull_context.
*/
gpu_cull_context_ptr create_gpu_cull_context(const vierkant::DevicePtr &device,
const glm::vec2 &size,
const vierkant::PipelineCachePtr &pipeline_cache = nullptr);

/**
- * @brief create_depth_pyramid can be used to create a 'hierarchical z-buffer (hzb)' or 'depth-pyramid'.
+ * @brief retrieve internally stored 'hierarchical z-buffer (hzb)' / depth-pyramid.
*
* @param context a provided gpu_cull_context_t
* @param params a provided struct with parameters
* @return a vierkant::ImagePtr containing the created depth-pyramid
*/
vierkant::ImagePtr get_depth_pyramid(const vierkant::gpu_cull_context_ptr &context);

/**
* @brief create_depth_pyramid can be used to create a 'hierarchical z-buffer (hzb)' / depth-pyramid.
*
- * @param context a provided vierkant::Device.
+ * @param context a provided gpu_cull_context_t
* @param params a provided struct with parameters
* @return a vierkant::ImagePtr containing the created depth-pyramid
*/
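Taken together, the changes above mean the cull-context is now created with a framebuffer-size and keeps its depth-pyramid internally (pre-created to avoid the startup flicker mentioned in commit 74e854b), while get_depth_pyramid merely hands it out. A minimal usage sketch, assuming a valid vierkant::DevicePtr and an arbitrary 1920x1080 framebuffer-size:

#include <vierkant/gpu_culling.hpp>

vierkant::ImagePtr fetch_depth_pyramid(const vierkant::DevicePtr &device)
{
    // size is now passed at creation-time, matching the changed factory-signature
    auto cull_context = vierkant::create_gpu_cull_context(device, glm::vec2(1920, 1080));

    // retrieve the internally stored hierarchical z-buffer
    return vierkant::get_depth_pyramid(cull_context);
}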
10 changes: 5 additions & 5 deletions include/vierkant/hash.hpp
@@ -4,9 +4,9 @@

#pragma once

-#include <functional>
-#include <cstring>
#include <cstdint>
+#include <cstring>
+#include <functional>

namespace vierkant
{
@@ -90,11 +90,11 @@ static inline uint32_t murmur3_32(const K &key, uint32_t seed)

if constexpr(num_hashes)
{
-auto ptr = reinterpret_cast<const uint32_t *>(&key);
+auto ptr = reinterpret_cast<const uint32_t *>(&key), end = ptr + num_hashes;

-for(uint32_t i = num_hashes; i; i--)
+for(; ptr < end; ++ptr)
{
-h ^= murmur_32_scramble(ptr[i - 1]);
+h ^= murmur_32_scramble(*ptr);
h = (h << 13) | (h >> 19);
h = h * 5 + 0xe6546b64;
}
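The rewritten loop walks the key's 32-bit words front-to-back via a pointer/end pair instead of a reverse index. A hedged usage sketch for murmur3_32, assuming keys are plain structs whose size is a multiple of four bytes (the key type here is hypothetical, only for illustration):

#include <cstdint>
#include <vierkant/hash.hpp>

// hypothetical key type
struct meshlet_key_t
{
    uint32_t object_index = 0;
    uint32_t meshlet_index = 0;
};

uint32_t hash_example()
{
    meshlet_key_t key = {42, 7};
    return vierkant::murmur3_32(key, 0 /*seed*/);
}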
196 changes: 194 additions & 2 deletions include/vierkant/linear_hashmap.hpp
@@ -41,6 +41,198 @@ class linear_hashmap
clear();
}

[[nodiscard]] inline size_t size() const { return m_num_elements; }

[[nodiscard]] inline size_t capacity() const { return m_capacity; }

[[nodiscard]] inline bool empty() const { return size() == 0; }

inline void clear()
{
m_num_elements = 0;
storage_item_t *ptr = m_storage.get(), *end = ptr + m_capacity;
for(; ptr != end; ++ptr)
{
ptr->key = key_t();
ptr->value = std::optional<value_t>();
}
}

inline uint32_t put(const key_t &key, const value_t &value)
{
check_load_factor();
return internal_put(key, value);
}

[[nodiscard]] std::optional<value_t> get(const key_t &key) const
{
if(!m_capacity) { return {}; }

for(uint32_t idx = m_hash_fn(key);; idx++)
{
idx &= m_capacity - 1;
auto &item = m_storage[idx];
if(item.key == key_t()) { return {}; }
else if(key == item.key)
{
if(item.value) { return item.value; }
}
}
}

void remove(const key_t &key)
{
if(!m_capacity) { return; }

for(uint32_t idx = m_hash_fn(key);; idx++)
{
idx &= m_capacity - 1;
auto &item = m_storage[idx];
if(item.key == key_t()) { return; }
else if(key == item.key && item.value)
{
item.value = {};
m_num_elements--;
return;
}
}
}

[[nodiscard]] inline bool contains(const key_t &key) const { return get(key) != std::nullopt; }

size_t get_storage(void *dst) const
{
struct output_item_t
{
key_t key = {};
value_t value = {};
};

if(dst)
{
auto output_ptr = reinterpret_cast<output_item_t *>(dst);
storage_item_t *item = m_storage.get(), *end = item + m_capacity;
for(; item != end; ++item, ++output_ptr)
{
if(item->key != key_t())
{
output_ptr->key = item->key;
output_ptr->value = item->value ? *item->value : value_t();
}
else { *output_ptr = {}; }
}
}
return sizeof(output_item_t) * m_capacity;
}

void reserve(size_t new_capacity)
{
auto new_linear_hashmap = linear_hashmap(new_capacity);
storage_item_t *ptr = m_storage.get(), *end = ptr + m_capacity;
for(; ptr != end; ++ptr)
{
if(ptr->key != key_t())
{
if(ptr->value) { new_linear_hashmap.put(ptr->key, *ptr->value); }
}
}
swap(*this, new_linear_hashmap);
}

[[nodiscard]] float load_factor() const { return static_cast<float>(m_num_elements) / m_capacity; }

[[nodiscard]] float max_load_factor() const { return m_max_load_factor; }

void max_load_factor(float load_factor)
{
m_max_load_factor = std::clamp<float>(load_factor, 0.01f, 1.f);
check_load_factor();
}

friend void swap(linear_hashmap &lhs, linear_hashmap &rhs)
{
std::swap(lhs.m_capacity, rhs.m_capacity);
std::swap(lhs.m_num_elements, rhs.m_num_elements);
std::swap(lhs.m_storage, rhs.m_storage);
std::swap(lhs.m_hash_fn, rhs.m_hash_fn);
std::swap(lhs.m_max_load_factor, rhs.m_max_load_factor);
std::swap(lhs.m_grow_factor, rhs.m_grow_factor);
}

private:
struct storage_item_t
{
key_t key;
std::optional<value_t> value;
};

inline void check_load_factor()
{
if(m_num_elements >= m_capacity * m_max_load_factor)
{
reserve(std::max<size_t>(32, static_cast<size_t>(m_grow_factor * m_capacity)));
}
}

inline uint32_t internal_put(const key_t key, const value_t &value)
{
uint32_t probe_length = 0;

for(uint64_t idx = m_hash_fn(key);; idx++, probe_length++)
{
idx &= m_capacity - 1;
auto &item = m_storage[idx];

// load previous key
key_t probed_key = item.key;

if(probed_key != key)
{
// hit another valid entry, keep probing
if(probed_key != key_t() && item.value) { continue; }
item.key = key;
m_num_elements++;
}
item.value = value;
return probe_length;
}
}

uint64_t m_capacity = 0;
uint64_t m_num_elements = 0;
std::unique_ptr<storage_item_t[]> m_storage;
hash32_fn m_hash_fn = std::bind(murmur3_32<key_t>, std::placeholders::_1, 0);

// reasonably low load-factor to keep average probe-lengths low
float m_max_load_factor = 0.5f;
float m_grow_factor = 2.f;
};

template<typename K, typename V>
class linear_hashmap_mt
{
public:
using key_t = K;
using value_t = V;
using hash32_fn = std::function<uint32_t(const key_t &)>;
static_assert(std::is_default_constructible_v<key_t>, "key_t not default-constructible");
static_assert(std::equality_comparable<key_t>, "key_t not comparable");

linear_hashmap_mt() = default;
linear_hashmap_mt(const linear_hashmap_mt &) = delete;
linear_hashmap_mt(linear_hashmap_mt &other) : linear_hashmap_mt() { swap(*this, other); };
linear_hashmap_mt &operator=(linear_hashmap_mt other)
{
swap(*this, other);
return *this;
}

explicit linear_hashmap_mt(uint64_t min_capacity)
: m_capacity(crocore::next_pow_2(min_capacity)), m_storage(std::make_unique<storage_item_t[]>(m_capacity))
{
clear();
}

inline size_t size() const { return m_num_elements; }

inline size_t capacity() const { return m_capacity; }
@@ -133,7 +325,7 @@ class linear_hashmap

void reserve(size_t new_capacity)
{
-auto new_linear_hashmap = linear_hashmap(new_capacity);
+auto new_linear_hashmap = linear_hashmap_mt(new_capacity);
storage_item_t *ptr = m_storage.get(), *end = ptr + m_capacity;
for(; ptr != end; ++ptr)
{
@@ -155,7 +347,7 @@
check_load_factor();
}

-friend void swap(linear_hashmap &lhs, linear_hashmap &rhs)
+friend void swap(linear_hashmap_mt &lhs, linear_hashmap_mt &rhs)
{
std::lock(lhs.m_mutex, rhs.m_mutex);
std::unique_lock lock_lhs(lhs.m_mutex, std::adopt_lock), lock_rhs(rhs.m_mutex, std::adopt_lock);
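The new linear_hashmap (and the mutex-guarded linear_hashmap_mt below it) is an open-addressing table with linear probing: capacity is rounded up to a power of two, the default-constructed key marks an empty slot, values live in a std::optional, and get_storage() flattens the table into a plain key/value array, e.g. for upload to a GPU-side probe-table. A usage sketch, assuming the class sits in the vierkant namespace like the surrounding headers:

#include <cstdint>
#include <vector>
#include <vierkant/linear_hashmap.hpp>

int main()
{
    // capacity rounds up to the next power of two; key_t() (here: 0) is the empty-slot marker,
    // so 0 must not be used as a real key
    vierkant::linear_hashmap<uint64_t, uint32_t> map(64);

    map.put(0xdeadbeef, 1u);
    map.put(0xc0ffee, 2u);

    // get() returns a std::optional<value_t>
    if(auto v = map.get(0xc0ffee)) { /* *v == 2 */ }

    map.remove(0xdeadbeef);
    bool removed = !map.contains(0xdeadbeef);

    // first call with nullptr yields the required byte-size, second call writes the flat array
    std::vector<uint8_t> storage(map.get_storage(nullptr));
    map.get_storage(storage.data());
    return removed ? 0 : 1;
}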