From a223dfc802499e351bf26ef491a109167b5e77d3 Mon Sep 17 00:00:00 2001 From: crocdialer Date: Fri, 18 Oct 2024 10:18:39 +0200 Subject: [PATCH 01/23] Start adding meshlet-visibilty/HZB-culling --- include/vierkant/Rasterizer.hpp | 1 + shaders/pbr/cull_meshlets.task | 8 ++++++++ shaders/renderer/types.glsl | 1 + 3 files changed, 10 insertions(+) diff --git a/include/vierkant/Rasterizer.hpp b/include/vierkant/Rasterizer.hpp index 301daf10..db32a120 100644 --- a/include/vierkant/Rasterizer.hpp +++ b/include/vierkant/Rasterizer.hpp @@ -53,6 +53,7 @@ class Rasterizer BINDING_MESHLETS = 13, BINDING_MESHLET_VERTICES = 14, BINDING_MESHLET_TRIANGLES = 15, + BINDING_MESHLET_VISIBILITY = 16, BINDING_MAX_RANGE }; diff --git a/shaders/pbr/cull_meshlets.task b/shaders/pbr/cull_meshlets.task index 9bab1cbc..2daf63c6 100644 --- a/shaders/pbr/cull_meshlets.task +++ b/shaders/pbr/cull_meshlets.task @@ -1,6 +1,7 @@ #version 460 #extension GL_EXT_mesh_shader : require #extension GL_GOOGLE_include_directive : require +#extension GL_EXT_buffer_reference2: require #extension GL_KHR_shader_subgroup_ballot: require #include "../renderer/types.glsl" @@ -22,6 +23,13 @@ layout(std140, set = 0, binding = BINDING_MESH_DRAWS) readonly buffer MeshDrawBu mesh_draw_t draws[]; }; +// read/write meshlet-visibility buffers +layout(buffer_reference, std430) buffer MeshletVisibilityBufferPtr { uint v[]; }; +layout(binding = BINDING_MESHLET_VISIBILITY, set = 0) readonly buffer MeshletVisibilities +{ + MeshletVisibilityBufferPtr meshlet_visibilities []; +}; + layout(std140, binding = BINDING_JITTER_OFFSET) uniform UBOJitter { camera_t camera; diff --git a/shaders/renderer/types.glsl b/shaders/renderer/types.glsl index 7b3d7255..a24d55d4 100644 --- a/shaders/renderer/types.glsl +++ b/shaders/renderer/types.glsl @@ -133,6 +133,7 @@ struct render_context_t #define BINDING_MESHLETS 13 #define BINDING_MESHLET_VERTICES 14 #define BINDING_MESHLET_TRIANGLES 15 +#define BINDING_MESHLET_VISIBILITY 16 //! 
combined indirect-draw struct struct indexed_indirect_command_t From 459e3e8710f236c6a511682cf2f9dc7443288b62 Mon Sep 17 00:00:00 2001 From: crocdialer Date: Fri, 18 Oct 2024 14:33:57 +0200 Subject: [PATCH 02/23] meshlet visibility-bits in place, passthrough here --- include/vierkant/Rasterizer.hpp | 14 ++++++------ shaders/pbr/cull_meshlets.task | 30 ++++++++++++++++++++------ shaders/renderer/types.glsl | 3 ++- src/Rasterizer.cpp | 38 ++++++++++++++++++++++++++++++--- 4 files changed, 69 insertions(+), 16 deletions(-) diff --git a/include/vierkant/Rasterizer.hpp b/include/vierkant/Rasterizer.hpp index db32a120..ae74a8e2 100644 --- a/include/vierkant/Rasterizer.hpp +++ b/include/vierkant/Rasterizer.hpp @@ -24,13 +24,12 @@ using double_millisecond_t = std::chrono::duration; /** * @brief Rasterizer can be used to run arbitrary rasterization/graphics pipelines. * - * It will not render anything on its own, only record secondary command-buffers, - * meant to be executed within an existing renderpass. + * It will not submit anything on its own, only record drawing commands into command-buffers. * - * Required resources like descriptor-sets and uniform-buffers will be created - * and kept alive, depending on the requested number of in-flight (pending) frames. + * Required resources like descriptor-sets and uniform-buffers will be created + * and kept alive, depending on the requested number of in-flight (pending) frames. * - * Renderer is NOT thread-safe, with the exception of stage_drawables(...). + * Renderer is NOT thread-safe, with the exception of stage_drawables(...). 
*/ class Rasterizer { @@ -77,7 +76,7 @@ class Rasterizer vierkant::Mesh::lod_t lods[8]; }; - struct indexed_indirect_command_t + struct alignas(16) indexed_indirect_command_t { VkDrawIndexedIndirectCommand vk_draw = {};// size: 5 @@ -87,8 +86,10 @@ class Rasterizer uint32_t object_index = 0; uint32_t base_meshlet = 0; uint32_t num_meshlets = 0; + uint32_t meshlet_visibility_index = 0; uint32_t count_buffer_offset = 0; uint32_t first_draw_index = 0; + uint32_t pad[1]{}; }; struct indirect_draw_bundle_t @@ -272,6 +273,7 @@ class Rasterizer vierkant::BufferPtr mesh_draw_buffer; vierkant::BufferPtr mesh_entry_buffer; vierkant::BufferPtr material_buffer; + vierkant::BufferPtr meshlet_visibility_buffer; // host visible keep-alive staging-buffer vierkant::BufferPtr staging_buffer; diff --git a/shaders/pbr/cull_meshlets.task b/shaders/pbr/cull_meshlets.task index 2daf63c6..a42ed41b 100644 --- a/shaders/pbr/cull_meshlets.task +++ b/shaders/pbr/cull_meshlets.task @@ -23,11 +23,10 @@ layout(std140, set = 0, binding = BINDING_MESH_DRAWS) readonly buffer MeshDrawBu mesh_draw_t draws[]; }; -// read/write meshlet-visibility buffers -layout(buffer_reference, std430) buffer MeshletVisibilityBufferPtr { uint v[]; }; -layout(binding = BINDING_MESHLET_VISIBILITY, set = 0) readonly buffer MeshletVisibilities +// read/write meshlet-visibility buffer +layout(binding = BINDING_MESHLET_VISIBILITY, set = 0) buffer MeshletVisibilities { - MeshletVisibilityBufferPtr meshlet_visibilities []; + uint meshlet_visibilities[]; }; layout(std140, binding = BINDING_JITTER_OFFSET) uniform UBOJitter @@ -57,6 +56,23 @@ bool cone_cull(vec3 center, float radius, vec3 cone_axis, float cone_cutoff, vec return dot(center - camera_position, cone_axis) >= cone_cutoff * length(center - camera_position) + radius; } +bool is_visible(uint visibility_base_index, uint meshlet_index) +{ + return (meshlet_visibilities[visibility_base_index + meshlet_index / 32] & (1 << (meshlet_index & 31U))) != 0; +} + +void 
set_visible(uint visibility_base_index, uint meshlet_index, bool visible) +{ + if(visible) + { + meshlet_visibilities[visibility_base_index + meshlet_index / 32] |= (1 << (meshlet_index & 31U)); + } + else + { + meshlet_visibilities[visibility_base_index + meshlet_index / 32] &= ~(1 << (meshlet_index & 31U)); + } +} + #define CULLING 1 void main() @@ -76,6 +92,8 @@ void main() task_payload.meshlet_base_index = meshlet_base_index; #if CULLING + bool accept = is_visible(draw_command.meshlet_visibility_index, mi); + // transform bounding volume and normal-cone mat4 m = camera.view * mat4_cast(draws[object_index].current_matrices.transform); vec3 cone_axis = normalize(mat3(m) * meshlets[mi].cone_axis); @@ -84,8 +102,8 @@ void main() float radius = meshlets[mi].sphere_radius * length(m[0].xyz); // backface-culling - bool accept = materials[draws[object_index].material_index].two_sided || - !cone_cull(center, radius, cone_axis, cone_cutoff, vec3(0)); + accept = accept && (materials[draws[object_index].material_index].two_sided || + !cone_cull(center, radius, cone_axis, cone_cutoff, vec3(0))); // frustum-culling accept = accept && !frustum_cull(center, radius, camera.frustum); diff --git a/shaders/renderer/types.glsl b/shaders/renderer/types.glsl index a24d55d4..eb3dad17 100644 --- a/shaders/renderer/types.glsl +++ b/shaders/renderer/types.glsl @@ -150,13 +150,14 @@ struct indexed_indirect_command_t uint groupCountY; uint groupCountZ; - bool visible; uint object_index; uint base_meshlet; uint num_meshlets; + uint meshlet_visibility_index; uint count_buffer_offset; uint first_draw_index; + uint pad; }; //! 
meshlet parameters diff --git a/src/Rasterizer.cpp b/src/Rasterizer.cpp index c7715012..5f6b0d4f 100644 --- a/src/Rasterizer.cpp +++ b/src/Rasterizer.cpp @@ -7,6 +7,8 @@ namespace vierkant { +inline uint32_t div_up(uint32_t nom, uint32_t denom) { return (nom + denom - 1) / denom; } + using std::chrono::duration_cast; using std::chrono::steady_clock; using duration_t = std::chrono::duration; @@ -348,6 +350,13 @@ void Rasterizer::render(VkCommandBuffer command_buffer, frame_assets_t &frame_as auto &desc_texture = drawable.descriptors[vierkant::Rasterizer::BINDING_TEXTURES]; desc_texture.type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; desc_texture.stage_flags = VK_SHADER_STAGE_FRAGMENT_BIT; + + if(vkCmdDrawMeshTasksEXT && use_mesh_shader && drawable.mesh && drawable.mesh->meshlets) + { + auto &desc_meshlet_vis = drawable.descriptors[Rasterizer::BINDING_MESHLET_VISIBILITY]; + desc_meshlet_vis.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + desc_meshlet_vis.stage_flags = VK_SHADER_STAGE_TASK_BIT_EXT; + } } // only provide a global texture-array for indirect draws if(indirect_draw) { drawable.descriptors.erase(BINDING_TEXTURES); } @@ -378,6 +387,9 @@ void Rasterizer::render(VkCommandBuffer command_buffer, frame_assets_t &frame_as // batch/pipeline index uint32_t count_buffer_offset = 0; + // meshlet-visibility index + uint32_t meshlet_visibility_index = 0; + // fill up indirect draw buffers for(const auto &[pipe_fmt, indexed_drawables]: pipeline_drawables) { @@ -404,6 +416,12 @@ void Rasterizer::render(VkCommandBuffer command_buffer, frame_assets_t &frame_as drawable->descriptors[BINDING_MESH_DRAWS].buffers = {frame_assets.mesh_draw_buffer}; drawable->descriptors[BINDING_MATERIAL].buffers = {frame_assets.material_buffer}; drawable->descriptors[BINDING_DRAW_COMMANDS].buffers = {draw_buffer_indexed}; + + if(drawable->descriptors.contains(BINDING_MESHLET_VISIBILITY)) + { + drawable->descriptors[BINDING_MESHLET_VISIBILITY].buffers = { + frame_assets.meshlet_visibility_buffer}; 
+ } } auto descriptor_set = vierkant::find_or_create_descriptor_set( @@ -443,9 +461,11 @@ void Rasterizer::render(VkCommandBuffer command_buffer, frame_assets_t &frame_as draw_command->num_meshlets = drawable->num_meshlets; //! VkDrawMeshTasksIndirectCommandEXT - draw_command->vk_mesh_draw.groupCountX = - (m_mesh_task_count + drawable->num_meshlets - 1) / m_mesh_task_count; + draw_command->vk_mesh_draw.groupCountX = div_up(drawable->num_meshlets, m_mesh_task_count); draw_command->vk_mesh_draw.groupCountY = draw_command->vk_mesh_draw.groupCountZ = 1; + + draw_command->meshlet_visibility_index = meshlet_visibility_index; + meshlet_visibility_index += div_up(drawable->num_meshlets, 32); } else { @@ -660,6 +680,9 @@ void Rasterizer::update_buffers(const std::vector &drawables, Raster std::vector mesh_draws(drawables.size()); std::vector material_data; + // joined meshlet-visibilities (1 bit per meshlet) + std::vector meshlet_visibility_data; + for(uint32_t i = 0; i < drawables.size(); i++) { const auto &drawable = drawables[i]; @@ -694,6 +717,12 @@ void Rasterizer::update_buffers(const std::vector &drawables, Raster material_index_map[mat] = material_data.size(); material_data.push_back(drawable.material); } + + // set all meshlet-bits hi/visible for all entry-lods + size_t num_meshlets = 0; + const auto &entry = drawable.mesh->entries[drawable.entry_index]; + for(const auto &lod: entry.lods) { num_meshlets += div_up(lod.num_meshlets, 32); } + meshlet_visibility_data.resize(meshlet_visibility_data.size() + num_meshlets, 0xFFFFFFFF); } else { material_data.push_back(drawable.material); } @@ -775,7 +804,9 @@ void Rasterizer::update_buffers(const std::vector &drawables, Raster VK_ACCESS_2_SHADER_READ_BIT, "Rasterizer: mesh_draws"); add_staging_copy(material_data, frame_asset.material_buffer, VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT, VK_ACCESS_2_SHADER_READ_BIT, "Rasterizer: material_data"); - + add_staging_copy(meshlet_visibility_data, 
frame_asset.meshlet_visibility_buffer, + VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_EXT, VK_ACCESS_2_SHADER_READ_BIT, + "Rasterizer: meshlet_visibility_data"); vierkant::staging_copy_context_t staging_context = {}; staging_context.command_buffer = frame_asset.staging_command_buffer.handle(); staging_context.staging_buffer = frame_asset.staging_buffer; @@ -790,6 +821,7 @@ void Rasterizer::update_buffers(const std::vector &drawables, Raster copy_to_buffer(mesh_entries, frame_asset.mesh_entry_buffer); copy_to_buffer(mesh_draws, frame_asset.mesh_draw_buffer); copy_to_buffer(material_data, frame_asset.material_buffer); + copy_to_buffer(meshlet_visibility_data, frame_asset.meshlet_visibility_buffer); } frame_asset.indirect_indexed_bundle.mesh_draws = frame_asset.mesh_draw_buffer; From b03ccd1000ea8eef3f3129d7a8c202f27588e96a Mon Sep 17 00:00:00 2001 From: crocdialer Date: Fri, 18 Oct 2024 16:07:11 +0200 Subject: [PATCH 03/23] need to account for lods, reserve space, adjust base_indices in gpu_cull --- shaders/pbr/indirect_cull.comp | 20 +++++++++++++++++--- src/Rasterizer.cpp | 26 +++++++++++++++++--------- src/gpu_culling.cpp | 2 +- 3 files changed, 35 insertions(+), 13 deletions(-) diff --git a/shaders/pbr/indirect_cull.comp b/shaders/pbr/indirect_cull.comp index fdb21b82..e53d66f4 100644 --- a/shaders/pbr/indirect_cull.comp +++ b/shaders/pbr/indirect_cull.comp @@ -51,7 +51,10 @@ struct draw_cull_data_t bool frustum_cull; bool occlusion_cull; bool contribution_cull; - bool backface_cull; + + // meshlet pipelines need to run main & post passes, culling per meshlet + bool meshlet_pipeline; + bool lod_enabled; // buffer references @@ -86,6 +89,8 @@ bool project_sphere(vec3 C, float r, float znear, float P00, float P11, out vec4 return true; } +uint div_up(uint nom, uint denom) { return (nom + denom - 1) / denom; } + layout(binding = 0) uniform sampler2D u_depth_pyramid; layout(std140, binding = 1) uniform culldata_ubo_t{ draw_cull_data_t cull_data; }; @@ -166,10 +171,15 @@ void 
main() draw.vertexOffset = int(mesh_entry.vertex_offset); draw.indexCount = lod.num_indices; draw.firstIndex = lod.base_index; - draw.groupCountX = (lod.num_meshlets + TASK_WORKGROUP_SIZE - 1) / TASK_WORKGROUP_SIZE; + draw.groupCountX = div_up(lod.num_meshlets, TASK_WORKGROUP_SIZE); draw.groupCountY = draw.groupCountZ = 1; draw.base_meshlet = lod.base_meshlet; draw.num_meshlets = lod.num_meshlets; + + for(uint i = 0; i < lod_index; ++i) + { + draw.meshlet_visibility_index += div_up(mesh_entry.lods[i].num_meshlets, 32); + } } // count managment @@ -180,7 +190,11 @@ void main() atomicAdd(cull_data.draw_result.v.num_meshlets, draw.num_meshlets); // became visible this frame -> not drawn in 1st-pass - if(!cull_data.draws_in.v[gid].visible) + bool needs_post_draw = !cull_data.draws_in.v[gid].visible; + + needs_post_draw = needs_post_draw || (cull_data.meshlet_pipeline && draw.num_meshlets > 0); + + if(needs_post_draw) { uint draw_cmd_offset_post = atomicAdd(cull_data.draw_count_post.v[draw.count_buffer_offset], 1); cull_data.draws_out_post.v[draw.first_draw_index + draw_cmd_offset_post] = draw; diff --git a/src/Rasterizer.cpp b/src/Rasterizer.cpp index 5f6b0d4f..ba777090 100644 --- a/src/Rasterizer.cpp +++ b/src/Rasterizer.cpp @@ -306,11 +306,15 @@ void Rasterizer::render(VkCommandBuffer command_buffer, frame_assets_t &frame_as struct indexed_drawable_t { uint32_t object_index = 0; + uint32_t meshlet_visibility_index = 0; vierkant::DescriptorSetLayoutPtr descriptor_set_layout = nullptr; drawable_t *drawable = nullptr; }; std::unordered_map> pipeline_drawables; + // meshlet-visibility index + uint32_t meshlet_visibility_index = 0; + // preprocess drawables for(uint32_t i = 0; i < frame_assets.drawables.size(); i++) { @@ -370,6 +374,15 @@ void Rasterizer::render(VkCommandBuffer command_buffer, frame_assets_t &frame_as // bindless texture-array pipeline_format.descriptor_set_layouts.push_back(bindless_texture_layout.get()); + if(drawable.mesh && 
drawable.mesh->entries.size() < drawable.entry_index) + { + indexed_drawable.meshlet_visibility_index = meshlet_visibility_index; + for(const auto &lod: drawable.mesh->entries[drawable.entry_index].lods) + { + meshlet_visibility_index += div_up(lod.num_meshlets, 32); + } + } + // push intermediate struct pipeline_drawables[pipeline_format].push_back(indexed_drawable); } @@ -387,9 +400,6 @@ void Rasterizer::render(VkCommandBuffer command_buffer, frame_assets_t &frame_as // batch/pipeline index uint32_t count_buffer_offset = 0; - // meshlet-visibility index - uint32_t meshlet_visibility_index = 0; - // fill up indirect draw buffers for(const auto &[pipe_fmt, indexed_drawables]: pipeline_drawables) { @@ -463,9 +473,7 @@ void Rasterizer::render(VkCommandBuffer command_buffer, frame_assets_t &frame_as //! VkDrawMeshTasksIndirectCommandEXT draw_command->vk_mesh_draw.groupCountX = div_up(drawable->num_meshlets, m_mesh_task_count); draw_command->vk_mesh_draw.groupCountY = draw_command->vk_mesh_draw.groupCountZ = 1; - - draw_command->meshlet_visibility_index = meshlet_visibility_index; - meshlet_visibility_index += div_up(drawable->num_meshlets, 32); + draw_command->meshlet_visibility_index = indexed_drawable.meshlet_visibility_index; } else { @@ -719,10 +727,10 @@ void Rasterizer::update_buffers(const std::vector &drawables, Raster } // set all meshlet-bits hi/visible for all entry-lods - size_t num_meshlets = 0; + size_t num_array_elems = 0; const auto &entry = drawable.mesh->entries[drawable.entry_index]; - for(const auto &lod: entry.lods) { num_meshlets += div_up(lod.num_meshlets, 32); } - meshlet_visibility_data.resize(meshlet_visibility_data.size() + num_meshlets, 0xFFFFFFFF); + for(const auto &lod: entry.lods) { num_array_elems += div_up(lod.num_meshlets, 32); } + meshlet_visibility_data.resize(meshlet_visibility_data.size() + num_array_elems, 0xFFFFFFFF); } else { material_data.push_back(drawable.material); } diff --git a/src/gpu_culling.cpp b/src/gpu_culling.cpp 
index 20a6d5b6..1823048c 100644 --- a/src/gpu_culling.cpp +++ b/src/gpu_culling.cpp @@ -50,7 +50,7 @@ struct alignas(16) draw_cull_data_t VkBool32 frustum_cull = false; VkBool32 occlusion_cull = false; VkBool32 contribution_cull = false; - VkBool32 backface_cull = false; + VkBool32 meshlet_pipeline = false; VkBool32 lod_enabled = false; // buffer references From ba909dad004665a1a28e6b0eb1de261ba07753ec Mon Sep 17 00:00:00 2001 From: crocdialer Date: Fri, 18 Oct 2024 16:31:03 +0200 Subject: [PATCH 04/23] small excourse, use only two component of normal-map (required for BC5) --- shaders/pbr/g_buffer_uber.frag | 5 +++-- shaders/ray/closesthit.rchit | 9 +++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/shaders/pbr/g_buffer_uber.frag b/shaders/pbr/g_buffer_uber.frag index a7ba0e35..6ae37636 100644 --- a/shaders/pbr/g_buffer_uber.frag +++ b/shaders/pbr/g_buffer_uber.frag @@ -120,8 +120,9 @@ void main() if((material.texture_type_flags & TEXTURE_TYPE_NORMAL) != 0) { uint offset = tex_offset(TEXTURE_TYPE_NORMAL, material.texture_type_flags); - normal = normalize(2.0 * (texture(u_sampler_2D[material.base_texture_index + offset], - vertex_in.tex_coord.xy).xyz - vec3(0.5))); + normal.xy = 2.0 * (texture(u_sampler_2D[material.base_texture_index + offset], + vertex_in.tex_coord.xy).xy - vec2(0.5)); + normal.z = sqrt(1.0 - normal.x * normal.x - normal.y * normal.y); // normal, tangent, bi-tangent vec3 t = normalize(vertex_in.tangent); diff --git a/shaders/ray/closesthit.rchit b/shaders/ray/closesthit.rchit index 7b4907c4..41efeb6b 100644 --- a/shaders/ray/closesthit.rchit +++ b/shaders/ray/closesthit.rchit @@ -291,10 +291,11 @@ void main() v.tangent = normalize(v.tangent); // sample normalmap - vec3 normal = normalize(2.0 * (sample_texture_lod(u_textures[material.normalmap_index], - v.tex_coord, NoV, payload.cone.width, triangle_lod).xyz - - vec3(0.5))); - + vec3 normal; + normal.xy = 2.0 * (sample_texture_lod(u_textures[material.normalmap_index], + 
v.tex_coord, NoV, payload.cone.width, triangle_lod).xy - vec2(0.5)); + normal.z = sqrt(1.0 - normal.x * normal.x - normal.y * normal.y); + // normal, tangent, bi-tangent vec3 b = normalize(cross(v.normal, v.tangent)); payload.normal = mat3(v.tangent, b, payload.normal) * normal; From b6f7d0dbef14b165ce528093c0f02d5af0b97eab Mon Sep 17 00:00:00 2001 From: crocdialer Date: Sat, 19 Oct 2024 11:54:03 +0200 Subject: [PATCH 05/23] rewrite div/modulo with shifts --- shaders/pbr/cull_meshlets.task | 7 ++++--- shaders/ray/closesthit.rchit | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/shaders/pbr/cull_meshlets.task b/shaders/pbr/cull_meshlets.task index a42ed41b..c3f74f50 100644 --- a/shaders/pbr/cull_meshlets.task +++ b/shaders/pbr/cull_meshlets.task @@ -58,18 +58,18 @@ bool cone_cull(vec3 center, float radius, vec3 cone_axis, float cone_cutoff, vec bool is_visible(uint visibility_base_index, uint meshlet_index) { - return (meshlet_visibilities[visibility_base_index + meshlet_index / 32] & (1 << (meshlet_index & 31U))) != 0; + return (meshlet_visibilities[visibility_base_index + meshlet_index >> 5] & (1 << (meshlet_index & 31U))) != 0; } void set_visible(uint visibility_base_index, uint meshlet_index, bool visible) { if(visible) { - meshlet_visibilities[visibility_base_index + meshlet_index / 32] |= (1 << (meshlet_index & 31U)); + meshlet_visibilities[visibility_base_index + meshlet_index >> 5] |= (1 << (meshlet_index & 31U)); } else { - meshlet_visibilities[visibility_base_index + meshlet_index / 32] &= ~(1 << (meshlet_index & 31U)); + meshlet_visibilities[visibility_base_index + meshlet_index >> 5] &= ~(1 << (meshlet_index & 31U)); } } @@ -109,6 +109,7 @@ void main() accept = accept && !frustum_cull(center, radius, camera.frustum); // TODO: occlusion-culling / visibility recording +// set_visible(draw_command.meshlet_visibility_index, mi, mi % 2 != 0); // determine indices/count via ballot uvec4 ballot = subgroupBallot(accept); diff --git 
a/shaders/ray/closesthit.rchit b/shaders/ray/closesthit.rchit index 41efeb6b..66e8e1d0 100644 --- a/shaders/ray/closesthit.rchit +++ b/shaders/ray/closesthit.rchit @@ -295,7 +295,7 @@ void main() normal.xy = 2.0 * (sample_texture_lod(u_textures[material.normalmap_index], v.tex_coord, NoV, payload.cone.width, triangle_lod).xy - vec2(0.5)); normal.z = sqrt(1.0 - normal.x * normal.x - normal.y * normal.y); - + // normal, tangent, bi-tangent vec3 b = normalize(cross(v.normal, v.tangent)); payload.normal = mat3(v.tangent, b, payload.normal) * normal; From 96fc38132831cf78ed15050371428087e7e68646 Mon Sep 17 00:00:00 2001 From: crocdialer Date: Sat, 19 Oct 2024 13:41:51 +0200 Subject: [PATCH 06/23] some rewrite in Rasterizer, create descriptor-sets after delegate --- include/vierkant/descriptor.hpp | 12 ++--- src/Compute.cpp | 4 +- src/Rasterizer.cpp | 84 +++++++++++++++++++-------------- src/RayTracer.cpp | 6 +-- src/descriptor.cpp | 7 ++- 5 files changed, 62 insertions(+), 51 deletions(-) diff --git a/include/vierkant/descriptor.hpp b/include/vierkant/descriptor.hpp index 8f477459..e0a034d2 100644 --- a/include/vierkant/descriptor.hpp +++ b/include/vierkant/descriptor.hpp @@ -87,15 +87,15 @@ DescriptorSetLayoutPtr create_descriptor_set_layout(const vierkant::DevicePtr &d const descriptor_map_t &descriptors); /** - * @brief Create a shared VkDescriptorSet (DescriptorSetPtr) for a provided DescriptorLayout + * @brief Create a shared VkDescriptorSet (DescriptorSetPtr) for a provided set-layout. 
* - * @param device handle for the vierkant::Device to create the DescriptorSet - * @param pool handle for a shared VkDescriptorPool to allocate the DescriptorSet from - * @param layout handle for a shared VkDescriptorSetLayout to use as blueprint + * @param device handle for the vierkant::Device to create the DescriptorSet + * @param pool handle for a shared VkDescriptorPool to allocate the DescriptorSet from + * @param set_layout handle for a VkDescriptorSetLayout * @return the newly created DescriptorSetPtr */ DescriptorSetPtr create_descriptor_set(const vierkant::DevicePtr &device, const DescriptorPoolPtr &pool, - const DescriptorSetLayoutPtr &layout, bool variable_count); + VkDescriptorSetLayout set_layout, bool variable_count); /** * @brief Update an existing shared VkDescriptorSet with a provided array of vierkant::descriptor_t. @@ -146,7 +146,7 @@ DescriptorSetLayoutPtr find_or_create_set_layout(const vierkant::DevicePtr &devi * @return a retrieved or newly created, shared VkDescriptorSet. 
*/ DescriptorSetPtr find_or_create_descriptor_set(const vierkant::DevicePtr &device, - const DescriptorSetLayoutPtr &set_layout, + VkDescriptorSetLayout set_layout, const descriptor_map_t &descriptors, const vierkant::DescriptorPoolPtr &pool, descriptor_set_map_t &last, descriptor_set_map_t ¤t, bool variable_count, diff --git a/src/Compute.cpp b/src/Compute.cpp index 09758a8b..0e9176bc 100644 --- a/src/Compute.cpp +++ b/src/Compute.cpp @@ -94,8 +94,8 @@ void Compute::dispatch(std::vector computables, VkCommandBuffer co // fetch descriptor set auto descriptor_set = vierkant::find_or_create_descriptor_set( - m_device, set_layout, computable.descriptors, m_descriptor_pool, compute_asset.descriptor_set_cache, - next_descriptor_set_cache, false); + m_device, set_layout.get(), computable.descriptors, m_descriptor_pool, + compute_asset.descriptor_set_cache, next_descriptor_set_cache, false); // update descriptor-set with actual descriptors vierkant::update_descriptor_set(m_device, computable.descriptors, descriptor_set); diff --git a/src/Rasterizer.cpp b/src/Rasterizer.cpp index ba777090..ec73c906 100644 --- a/src/Rasterizer.cpp +++ b/src/Rasterizer.cpp @@ -239,8 +239,11 @@ void Rasterizer::render(VkCommandBuffer command_buffer, frame_assets_t &frame_as uint32_t num_draws = 0; uint32_t first_draw_index = 0; uint32_t first_indexed_draw_index = 0; - std::vector descriptor_set_handles; VkRect2D scissor = {}; + std::vector descriptor_set_handles; + VkDescriptorSetLayout descriptor_set_layout = VK_NULL_HANDLE; + + const drawable_t *drawable = nullptr; }; using draw_batch_t = std::vector>; std::unordered_map pipelines; @@ -387,16 +390,6 @@ void Rasterizer::render(VkCommandBuffer command_buffer, frame_assets_t &frame_as pipeline_drawables[pipeline_format].push_back(indexed_drawable); } - vierkant::BufferPtr draw_buffer = frame_assets.indirect_bundle.draws_in; - vierkant::BufferPtr draw_buffer_indexed = frame_assets.indirect_indexed_bundle.draws_in; - - // hook up GPU 
frustum/occlusion/distance culling here - if(indirect_draw && draw_indirect_delegate) - { - draw_buffer = frame_assets.indirect_bundle.draws_out; - draw_buffer_indexed = frame_assets.indirect_indexed_bundle.draws_out; - } - // batch/pipeline index uint32_t count_buffer_offset = 0; @@ -418,31 +411,8 @@ void Rasterizer::render(VkCommandBuffer command_buffer, frame_assets_t &frame_as new_draw.first_draw_index = frame_assets.indirect_bundle.num_draws; new_draw.first_indexed_draw_index = frame_assets.indirect_indexed_bundle.num_draws; new_draw.scissor = drawable->pipeline_format.scissor; - - // predefined buffers - if(!drawable->use_own_buffers) - { - drawable->descriptors[BINDING_VERTICES].buffers = {frame_assets.vertex_buffer_refs}; - drawable->descriptors[BINDING_MESH_DRAWS].buffers = {frame_assets.mesh_draw_buffer}; - drawable->descriptors[BINDING_MATERIAL].buffers = {frame_assets.material_buffer}; - drawable->descriptors[BINDING_DRAW_COMMANDS].buffers = {draw_buffer_indexed}; - - if(drawable->descriptors.contains(BINDING_MESHLET_VISIBILITY)) - { - drawable->descriptors[BINDING_MESHLET_VISIBILITY].buffers = { - frame_assets.meshlet_visibility_buffer}; - } - } - - auto descriptor_set = vierkant::find_or_create_descriptor_set( - m_device, indexed_drawable.descriptor_set_layout, drawable->descriptors, m_descriptor_pool, - frame_assets.descriptor_sets, next_descriptor_sets, false); - auto bindless_texture_set = vierkant::find_or_create_descriptor_set( - m_device, bindless_texture_layout, bindless_texture_desc, m_descriptor_pool, - frame_assets.descriptor_sets, next_descriptor_sets, false); - - new_draw.descriptor_set_handles = {descriptor_set.get(), bindless_texture_set.get()}; - + new_draw.drawable = drawable; + new_draw.descriptor_set_layout = indexed_drawable.descriptor_set_layout.get(); indirect_draws.emplace_back(drawable->mesh.get(), std::move(new_draw)); } auto &indirect_draw_asset = indirect_draws.back().second; @@ -488,11 +458,53 @@ void 
Rasterizer::render(VkCommandBuffer command_buffer, frame_assets_t &frame_as } } + vierkant::BufferPtr draw_buffer = frame_assets.indirect_bundle.draws_in; + vierkant::BufferPtr draw_buffer_indexed = frame_assets.indirect_indexed_bundle.draws_in; + // hook up GPU frustum/occlusion/distance culling here if(indirect_draw && draw_indirect_delegate) { // invoke delegate draw_indirect_delegate(frame_assets.indirect_indexed_bundle); + + // set buffers + draw_buffer = frame_assets.indirect_bundle.draws_out; + draw_buffer_indexed = frame_assets.indirect_indexed_bundle.draws_out; + } + + // set buffer-descriptors after delegate + for(const auto &[pipe_fmt, indexed_drawables]: pipeline_drawables) + { + if(indexed_drawables.empty()) { continue; } + auto &indirect_draws = pipelines[pipe_fmt]; + + for(auto &[mesh, draw_asset]: indirect_draws) + { + auto descriptors = draw_asset.drawable->descriptors; + + // predefined buffers + if(!draw_asset.drawable->use_own_buffers) + { + descriptors[BINDING_VERTICES].buffers = {frame_assets.vertex_buffer_refs}; + descriptors[BINDING_MESH_DRAWS].buffers = {frame_assets.mesh_draw_buffer}; + descriptors[BINDING_MATERIAL].buffers = {frame_assets.material_buffer}; + descriptors[BINDING_DRAW_COMMANDS].buffers = {draw_buffer_indexed}; + + if(descriptors.contains(BINDING_MESHLET_VISIBILITY)) + { + descriptors[BINDING_MESHLET_VISIBILITY].buffers = {frame_assets.meshlet_visibility_buffer}; + } + } + + auto descriptor_set = vierkant::find_or_create_descriptor_set( + m_device, draw_asset.descriptor_set_layout, descriptors, m_descriptor_pool, + frame_assets.descriptor_sets, next_descriptor_sets, false); + auto bindless_texture_set = vierkant::find_or_create_descriptor_set( + m_device, bindless_texture_layout.get(), bindless_texture_desc, m_descriptor_pool, + frame_assets.descriptor_sets, next_descriptor_sets, false); + + draw_asset.descriptor_set_handles = {descriptor_set.get(), bindless_texture_set.get()}; + } } // push constants diff --git 
a/src/RayTracer.cpp b/src/RayTracer.cpp index 324f80af..329b8ece 100644 --- a/src/RayTracer.cpp +++ b/src/RayTracer.cpp @@ -102,9 +102,9 @@ void RayTracer::trace_rays(tracable_t tracable, VkCommandBuffer commandbuffer) } // fetch descriptor set - auto descriptor_set = vierkant::find_or_create_descriptor_set(m_device, descriptor_set_layout, tracable.descriptors, - m_descriptor_pool, trace_asset.descriptor_set_cache, - next_descriptor_set_cache, false, true); + auto descriptor_set = vierkant::find_or_create_descriptor_set( + m_device, descriptor_set_layout.get(), tracable.descriptors, m_descriptor_pool, + trace_asset.descriptor_set_cache, next_descriptor_set_cache, false, true); // update descriptor-set with actual descriptors vierkant::update_descriptor_set(m_device, tracable.descriptors, descriptor_set); diff --git a/src/descriptor.cpp b/src/descriptor.cpp index ba355be3..6674c470 100644 --- a/src/descriptor.cpp +++ b/src/descriptor.cpp @@ -80,10 +80,9 @@ DescriptorSetLayoutPtr create_descriptor_set_layout(const vierkant::DevicePtr &d /////////////////////////////////////////////////////////////////////////////////////////////////// DescriptorSetPtr create_descriptor_set(const vierkant::DevicePtr &device, const DescriptorPoolPtr &pool, - const DescriptorSetLayoutPtr &layout, bool variable_count) + VkDescriptorSetLayout set_layout, bool variable_count) { VkDescriptorSet descriptor_set; - VkDescriptorSetLayout layout_handle = layout.get(); // max allocatable count uint32_t max_binding = g_max_bindless_resources - 1; @@ -98,7 +97,7 @@ DescriptorSetPtr create_descriptor_set(const vierkant::DevicePtr &device, const alloc_info.pNext = variable_count ? 
&descriptor_count_allocate_info : nullptr; alloc_info.descriptorPool = pool.get(); alloc_info.descriptorSetCount = 1; - alloc_info.pSetLayouts = &layout_handle; + alloc_info.pSetLayouts = &set_layout; spdlog::trace("create_descriptor_set - variable_count: {}", variable_count); @@ -447,7 +446,7 @@ void update_descriptor_buffer(const vierkant::DevicePtr &device, const Descripto //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// DescriptorSetPtr find_or_create_descriptor_set(const vierkant::DevicePtr &device, - const DescriptorSetLayoutPtr &set_layout, + VkDescriptorSetLayout set_layout, const descriptor_map_t &descriptors, const vierkant::DescriptorPoolPtr &pool, descriptor_set_map_t &last, descriptor_set_map_t ¤t, bool variable_count, bool relax_reuse) From ca004a3fd7e8ab45257be5abf90e30cc473a585e Mon Sep 17 00:00:00 2001 From: crocdialer Date: Sat, 19 Oct 2024 14:20:05 +0200 Subject: [PATCH 07/23] some follow-up rework inPBRDeferred, less copying --- include/vierkant/Rasterizer.hpp | 3 +++ src/PBRDeferred.cpp | 20 +++++++------------- src/Rasterizer.cpp | 3 ++- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/include/vierkant/Rasterizer.hpp b/include/vierkant/Rasterizer.hpp index ae74a8e2..b806c87e 100644 --- a/include/vierkant/Rasterizer.hpp +++ b/include/vierkant/Rasterizer.hpp @@ -106,6 +106,9 @@ class Rasterizer //! device array containing any array of material_t vierkant::BufferPtr materials; + //! device array a visibility bitfield for all meshlets + vierkant::BufferPtr meshlet_visibilies; + //! 
host-visible array of indexed_indirect_command_t vierkant::BufferPtr draws_in; diff --git a/src/PBRDeferred.cpp b/src/PBRDeferred.cpp index d5b0d529..16902e84 100644 --- a/src/PBRDeferred.cpp +++ b/src/PBRDeferred.cpp @@ -818,10 +818,7 @@ vierkant::Framebuffer &PBRDeferred::geometry_pass(cull_result_t &cull_result) } else if(params.num_draws && !frame_context.dirty_drawable_indices.empty()) { - VkBuffer buffers[] = {params.mesh_draws->handle(), - frame_context.indirect_draw_params_post.mesh_draws->handle()}; - vierkant::barrier(frame_context.cmd_clear.handle(), buffers, 2, src_stage, src_access, src_stage, - src_access); + params.mesh_draws->barrier(frame_context.cmd_clear.handle(), src_stage, src_access, src_stage, src_access); constexpr size_t stride = sizeof(Rasterizer::mesh_draw_t); constexpr size_t staging_stride = 2 * sizeof(matrix_struct_t); @@ -845,13 +842,6 @@ vierkant::Framebuffer &PBRDeferred::geometry_pass(cull_result_t &cull_result) copy_transform.dst_access = VK_ACCESS_2_SHADER_READ_BIT; copy_transform.dst_stage = VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT; copy_transforms.push_back(copy_transform); - - if(frame_context.indirect_draw_params_post.mesh_draws) - { - // extra copy into post-meshdraws, not most elegant way - copy_transform.dst_buffer = frame_context.indirect_draw_params_post.mesh_draws; - copy_transforms.push_back(copy_transform); - } i++; } vierkant::staging_copy(staging_context, copy_transforms); @@ -893,7 +883,11 @@ vierkant::Framebuffer &PBRDeferred::geometry_pass(cull_result_t &cull_result) m_g_renderer_post.draw_indirect_delegate = [this, &frame_context](Rasterizer::indirect_draw_bundle_t &params) { resize_indirect_draw_buffers(params.num_draws, frame_context.indirect_draw_params_post); params.draws_counts_out = frame_context.indirect_draw_params_post.draws_counts_out; - frame_context.indirect_draw_params_post.mesh_draws = params.mesh_draws; + + // re-use mesh-draws/transforms/visibilities from main-pass + params.mesh_draws = 
frame_context.indirect_draw_params_main.mesh_draws; + params.mesh_entries = frame_context.indirect_draw_params_main.mesh_entries; + params.meshlet_visibilies = frame_context.indirect_draw_params_main.meshlet_visibilies; // populate gpu-culling params vierkant::gpu_cull_params_t gpu_cull_params = {}; @@ -914,7 +908,7 @@ vierkant::Framebuffer &PBRDeferred::geometry_pass(cull_result_t &cull_result) gpu_cull_params.draws_out_pre = frame_context.indirect_draw_params_main.draws_out; gpu_cull_params.draws_counts_out_pre = frame_context.indirect_draw_params_main.draws_counts_out; gpu_cull_params.draws_out_post = params.draws_out; - gpu_cull_params.draws_counts_out_post = frame_context.indirect_draw_params_post.draws_counts_out; + gpu_cull_params.draws_counts_out_post = params.draws_counts_out; gpu_cull_params.semaphore_submit_info.semaphore = frame_context.timeline.handle(); gpu_cull_params.semaphore_submit_info.wait_value = diff --git a/src/Rasterizer.cpp b/src/Rasterizer.cpp index ec73c906..7e3e4f5d 100644 --- a/src/Rasterizer.cpp +++ b/src/Rasterizer.cpp @@ -845,8 +845,9 @@ void Rasterizer::update_buffers(const std::vector &drawables, Raster } frame_asset.indirect_indexed_bundle.mesh_draws = frame_asset.mesh_draw_buffer; - frame_asset.indirect_indexed_bundle.materials = frame_asset.material_buffer; frame_asset.indirect_indexed_bundle.mesh_entries = frame_asset.mesh_entry_buffer; + frame_asset.indirect_indexed_bundle.materials = frame_asset.material_buffer; + frame_asset.indirect_indexed_bundle.meshlet_visibilies = frame_asset.meshlet_visibility_buffer; } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// From 1c521783ae523c8245553ebfb53135ac71e83d70 Mon Sep 17 00:00:00 2001 From: crocdialer Date: Sat, 19 Oct 2024 17:54:01 +0200 Subject: [PATCH 08/23] almost there --- include/vierkant/Rasterizer.hpp | 1 + include/vierkant/gpu_culling.hpp | 2 ++ shaders/pbr/cull_meshlets.task | 31 
++++++++++++++++++++++++++++--- shaders/pbr/indirect_cull.comp | 30 +++++------------------------- shaders/renderer/types.glsl | 1 + shaders/utils/project_sphere.glsl | 25 +++++++++++++++++++++++++ src/PBRDeferred.cpp | 21 ++++++++++++++++++--- src/gpu_culling.cpp | 3 ++- 8 files changed, 82 insertions(+), 32 deletions(-) create mode 100644 shaders/utils/project_sphere.glsl diff --git a/include/vierkant/Rasterizer.hpp b/include/vierkant/Rasterizer.hpp index b806c87e..c03ce3ba 100644 --- a/include/vierkant/Rasterizer.hpp +++ b/include/vierkant/Rasterizer.hpp @@ -53,6 +53,7 @@ class Rasterizer BINDING_MESHLET_VERTICES = 14, BINDING_MESHLET_TRIANGLES = 15, BINDING_MESHLET_VISIBILITY = 16, + BINDING_DEPTH_PYRAMID = 17, BINDING_MAX_RANGE }; diff --git a/include/vierkant/gpu_culling.hpp b/include/vierkant/gpu_culling.hpp index 53a31cd4..23003534 100644 --- a/include/vierkant/gpu_culling.hpp +++ b/include/vierkant/gpu_culling.hpp @@ -35,6 +35,8 @@ struct gpu_cull_params_t //! limit number of LoDs (0: no limit) uint32_t max_num_lods = 0; + bool skip_meshlets = false; + VkQueue queue = VK_NULL_HANDLE; vierkant::semaphore_submit_info_t semaphore_submit_info = {}; diff --git a/shaders/pbr/cull_meshlets.task b/shaders/pbr/cull_meshlets.task index c3f74f50..fba5ce7a 100644 --- a/shaders/pbr/cull_meshlets.task +++ b/shaders/pbr/cull_meshlets.task @@ -7,6 +7,10 @@ #include "../renderer/types.glsl" #include "../renderer/mesh_task_payload.glsl" #include "../utils/camera.glsl" +#include "../utils/project_sphere.glsl" + +//! 
specialization constant for main/post pass +layout (constant_id = 2) const bool post_pass = false; layout(set = 0, binding = BINDING_DRAW_COMMANDS) readonly buffer DrawBuffer { @@ -40,6 +44,8 @@ layout(std140, set = 0, binding = BINDING_MATERIAL) readonly buffer MaterialBuff material_struct_t materials[]; }; +layout(set = 0, binding = BINDING_DEPTH_PYRAMID) uniform sampler2D u_depth_pyramid; + layout(push_constant) uniform PushConstants { render_context_t context; @@ -101,15 +107,34 @@ void main() vec3 center = (m * vec4(meshlets[mi].sphere_center, 1.0)).xyz; float radius = meshlets[mi].sphere_radius * length(m[0].xyz); - // backface-culling + // cone-culling accept = accept && (materials[draws[object_index].material_index].two_sided || !cone_cull(center, radius, cone_axis, cone_cutoff, vec3(0))); // frustum-culling accept = accept && !frustum_cull(center, radius, camera.frustum); - // TODO: occlusion-culling / visibility recording -// set_visible(draw_command.meshlet_visibility_index, mi, mi % 2 != 0); + // occlusion-culling / visibility recording + if(post_pass) + { + vec4 aabb; + bool sphere_visible = project_sphere(center, radius, camera.near, camera.projection[0][0], + camera.projection[1][1], aabb); + // cluster bound area in NDC + vec2 screen_area = aabb.zw - aabb.xy; + vec2 pyramid_size = textureSize(u_depth_pyramid, 0); + float width = screen_area.x * pyramid_size.x; + float height = screen_area.y * pyramid_size.y; + float level = floor(log2(max(width, height))); + + // Sampler is set up to do min reduction, so this computes the minimum depth of a 2x2 texel quad + float depth = textureLod(u_depth_pyramid, (aabb.xy + aabb.zw) * 0.5, level).x; + float depth_sphere = camera.near / (-center.z - radius); + + accept = accept && (depth_sphere >= depth); + set_visible(draw_command.meshlet_visibility_index, mi, mi % 2 != 0); + // if(!visible){ atomicAdd(cull_data.draw_result.v.num_occlusion_culled, 1); } + } // determine indices/count via ballot uvec4 ballot = 
subgroupBallot(accept); diff --git a/shaders/pbr/indirect_cull.comp b/shaders/pbr/indirect_cull.comp index e53d66f4..08788b92 100644 --- a/shaders/pbr/indirect_cull.comp +++ b/shaders/pbr/indirect_cull.comp @@ -7,6 +7,7 @@ #include "../renderer/types.glsl" #include "../renderer/mesh_task_payload.glsl" #include "../utils/camera.glsl" +#include "../utils/project_sphere.glsl" struct draw_cull_result_t { @@ -52,8 +53,8 @@ struct draw_cull_data_t bool occlusion_cull; bool contribution_cull; - // meshlet pipelines need to run main & post passes, culling per meshlet - bool meshlet_pipeline; + // do not cull objects cntaining meshlets + bool skip_meshlets; bool lod_enabled; @@ -68,27 +69,6 @@ struct draw_cull_data_t DrawResultPtr draw_result; }; -// 2D Polyhedral Bounds of a Clipped, Perspective-Projected 3D Sphere. Michael Mara, Morgan McGuire. 2013 -bool project_sphere(vec3 C, float r, float znear, float P00, float P11, out vec4 aabb) -{ - if (-C.z < r + znear){ return false; } - - vec2 cx = vec2(-C.x, C.z); - vec2 vx = vec2(sqrt(dot(cx, cx) - r * r), r); - vec2 minx = mat2(vx.x, vx.y, -vx.y, vx.x) * cx; - vec2 maxx = mat2(vx.x, -vx.y, vx.y, vx.x) * cx; - - vec2 cy = C.yz; - vec2 vy = vec2(sqrt(dot(cy, cy) - r * r), r); - vec2 miny = mat2(vy.x, vy.y, -vy.y, vy.x) * cy; - vec2 maxy = mat2(vy.x, -vy.y, vy.y, vy.x) * cy; - - aabb = vec4(minx.x / minx.y * P00, miny.x / miny.y * P11, maxx.x / maxx.y * P00, maxy.x / maxy.y * P11); - aabb = aabb * vec4(0.5f, -0.5f, 0.5f, -0.5f) + vec4(0.5f); // clip space -> uv space - - return true; -} - uint div_up(uint nom, uint denom) { return (nom + denom - 1) / denom; } layout(binding = 0) uniform sampler2D u_depth_pyramid; @@ -120,7 +100,7 @@ void main() bool sphere_visible = project_sphere(center, radius, cull_data.znear, cull_data.P00, cull_data.P11, aabb); // object bound's area in NDC - vec2 screen_area = aabb.zw - aabb.xy;//vec2(aabb.z - aabb.x, aabb.w - aabb.y); + vec2 screen_area = aabb.zw - aabb.xy; if(cull_data.frustum_cull) { 
@@ -192,7 +172,7 @@ void main() // became visible this frame -> not drawn in 1st-pass bool needs_post_draw = !cull_data.draws_in.v[gid].visible; - needs_post_draw = needs_post_draw || (cull_data.meshlet_pipeline && draw.num_meshlets > 0); + needs_post_draw = needs_post_draw || (cull_data.skip_meshlets && draw.num_meshlets > 0); if(needs_post_draw) { diff --git a/shaders/renderer/types.glsl b/shaders/renderer/types.glsl index eb3dad17..7094e495 100644 --- a/shaders/renderer/types.glsl +++ b/shaders/renderer/types.glsl @@ -134,6 +134,7 @@ struct render_context_t #define BINDING_MESHLET_VERTICES 14 #define BINDING_MESHLET_TRIANGLES 15 #define BINDING_MESHLET_VISIBILITY 16 +#define BINDING_DEPTH_PYRAMID 17 //! combined indirect-draw struct struct indexed_indirect_command_t diff --git a/shaders/utils/project_sphere.glsl b/shaders/utils/project_sphere.glsl new file mode 100644 index 00000000..1d96f744 --- /dev/null +++ b/shaders/utils/project_sphere.glsl @@ -0,0 +1,25 @@ +#ifndef UTILS_PROJECT_SPHERE_GLSL +#define UTILS_PROJECT_SPHERE_GLSL + +// 2D Polyhedral Bounds of a Clipped, Perspective-Projected 3D Sphere. Michael Mara, Morgan McGuire. 
2013 +bool project_sphere(vec3 C, float r, float znear, float P00, float P11, out vec4 aabb) +{ + if (-C.z < r + znear){ return false; } + + vec2 cx = vec2(-C.x, C.z); + vec2 vx = vec2(sqrt(dot(cx, cx) - r * r), r); + vec2 minx = mat2(vx.x, vx.y, -vx.y, vx.x) * cx; + vec2 maxx = mat2(vx.x, -vx.y, vx.y, vx.x) * cx; + + vec2 cy = C.yz; + vec2 vy = vec2(sqrt(dot(cy, cy) - r * r), r); + vec2 miny = mat2(vy.x, vy.y, -vy.y, vy.x) * cy; + vec2 maxy = mat2(vy.x, -vy.y, vy.y, vy.x) * cy; + + aabb = vec4(minx.x / minx.y * P00, miny.x / miny.y * P11, maxx.x / maxx.y * P00, maxy.x / maxy.y * P11); + aabb = aabb * vec4(0.5f, -0.5f, 0.5f, -0.5f) + vec4(0.5f); // clip space -> uv space + + return true; +} + +#endif //UTILS_PROJECT_SPHERE_GLSL \ No newline at end of file diff --git a/src/PBRDeferred.cpp b/src/PBRDeferred.cpp index 16902e84..46712651 100644 --- a/src/PBRDeferred.cpp +++ b/src/PBRDeferred.cpp @@ -729,6 +729,11 @@ vierkant::Framebuffer &PBRDeferred::geometry_pass(cull_result_t &cull_result) pipeline_specialization.set(0, mesh_shader_props.maxPreferredTaskWorkGroupInvocations); pipeline_specialization.set(1, mesh_shader_props.maxPreferredMeshWorkGroupInvocations); drawable.pipeline_format.specialization = std::move(pipeline_specialization); + + auto &desc_depth_pyramid = drawable.descriptors[Rasterizer::BINDING_DEPTH_PYRAMID]; + desc_depth_pyramid.type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; + desc_depth_pyramid.stage_flags = VK_SHADER_STAGE_TASK_BIT_EXT; + desc_depth_pyramid.images = {frame_context.depth_pyramid}; } // check if morph-targets are available @@ -758,10 +763,19 @@ vierkant::Framebuffer &PBRDeferred::geometry_pass(cull_result_t &cull_result) // add descriptor for a jitter-offset drawable.descriptors[Rasterizer::BINDING_JITTER_OFFSET] = camera_desc; + + // stage drawables + m_g_renderer_main.stage_drawable(drawable); + if(use_gpu_culling) + { + if(drawable.descriptors.contains(Rasterizer::BINDING_DEPTH_PYRAMID) && + 
drawable.pipeline_format.specialization) + { + drawable.pipeline_format.specialization->set(2, VK_TRUE); + } + m_g_renderer_post.stage_drawable(drawable); + } } - // stage drawables - m_g_renderer_main.stage_drawables(cull_result.drawables); - if(use_gpu_culling) { m_g_renderer_post.stage_drawables(cull_result.drawables); } } // apply current settings for both renderers @@ -900,6 +914,7 @@ vierkant::Framebuffer &PBRDeferred::geometry_pass(cull_result_t &cull_result) gpu_cull_params.frustum_cull = frame_context.settings.frustum_culling; gpu_cull_params.occlusion_cull = frame_context.settings.occlusion_culling; gpu_cull_params.lod_enabled = frame_context.settings.enable_lod; +// gpu_cull_params.skip_meshlets = frame_context.settings.use_meshlet_pipeline; gpu_cull_params.depth_pyramid = frame_context.depth_pyramid; gpu_cull_params.draws_in = frame_context.indirect_draw_params_main.draws_in; gpu_cull_params.mesh_draws_in = frame_context.indirect_draw_params_main.mesh_draws; diff --git a/src/gpu_culling.cpp b/src/gpu_culling.cpp index 1823048c..e2cd9303 100644 --- a/src/gpu_culling.cpp +++ b/src/gpu_culling.cpp @@ -50,7 +50,7 @@ struct alignas(16) draw_cull_data_t VkBool32 frustum_cull = false; VkBool32 occlusion_cull = false; VkBool32 contribution_cull = false; - VkBool32 meshlet_pipeline = false; + VkBool32 skip_meshlets = false; VkBool32 lod_enabled = false; // buffer references @@ -222,6 +222,7 @@ draw_cull_result_t gpu_cull(const vierkant::gpu_cull_context_ptr &context, const draw_cull_data.contribution_cull = params.contribution_cull; draw_cull_data.frustum_cull = params.frustum_cull; draw_cull_data.lod_enabled = params.lod_enabled; + draw_cull_data.skip_meshlets = params.skip_meshlets; // buffer references draw_cull_data.draw_commands_in = params.draws_in->device_address(); From fe9b01a872c15797056a71fa16299db98b8eaa9d Mon Sep 17 00:00:00 2001 From: crocdialer Date: Sat, 19 Oct 2024 18:28:31 +0200 Subject: [PATCH 09/23] tadaa, meshlet occlusion-culling working 
here --- shaders/pbr/cull_meshlets.task | 22 ++++++++------ shaders/pbr/indirect_cull.comp | 2 +- src/PBRDeferred.cpp | 2 +- src/Rasterizer.cpp | 4 +-- tests/TestMesh.cpp | 52 ++++++++++++++-------------------- 5 files changed, 39 insertions(+), 43 deletions(-) diff --git a/shaders/pbr/cull_meshlets.task b/shaders/pbr/cull_meshlets.task index fba5ce7a..d0129b2d 100644 --- a/shaders/pbr/cull_meshlets.task +++ b/shaders/pbr/cull_meshlets.task @@ -98,7 +98,6 @@ void main() task_payload.meshlet_base_index = meshlet_base_index; #if CULLING - bool accept = is_visible(draw_command.meshlet_visibility_index, mi); // transform bounding volume and normal-cone mat4 m = camera.view * mat4_cast(draws[object_index].current_matrices.transform); @@ -108,11 +107,11 @@ void main() float radius = meshlets[mi].sphere_radius * length(m[0].xyz); // cone-culling - accept = accept && (materials[draws[object_index].material_index].two_sided || - !cone_cull(center, radius, cone_axis, cone_cutoff, vec3(0))); + bool visible = materials[draws[object_index].material_index].two_sided || + !cone_cull(center, radius, cone_axis, cone_cutoff, vec3(0)); // frustum-culling - accept = accept && !frustum_cull(center, radius, camera.frustum); + visible = visible && !frustum_cull(center, radius, camera.frustum); // occlusion-culling / visibility recording if(post_pass) @@ -131,18 +130,25 @@ void main() float depth = textureLod(u_depth_pyramid, (aabb.xy + aabb.zw) * 0.5, level).x; float depth_sphere = camera.near / (-center.z - radius); - accept = accept && (depth_sphere >= depth); - set_visible(draw_command.meshlet_visibility_index, mi, mi % 2 != 0); + visible = visible && (depth_sphere >= depth); + + // became visible this frame -> not drawn in 1st-pass + visible = visible && !is_visible(draw_command.meshlet_visibility_index, mi); + // if(!visible){ atomicAdd(cull_data.draw_result.v.num_occlusion_culled, 1); } + + // update visiblity + set_visible(draw_command.meshlet_visibility_index, mi, visible); } + 
else{ visible = visible && is_visible(draw_command.meshlet_visibility_index, mi); } // determine indices/count via ballot - uvec4 ballot = subgroupBallot(accept); + uvec4 ballot = subgroupBallot(visible); uint index = subgroupBallotExclusiveBitCount(ballot); uint count = subgroupBallotBitCount(ballot); // write payload delta-index - if (accept){ task_payload.meshlet_delta_indices[index] = uint8_t(ti); } + if (visible){ task_payload.meshlet_delta_indices[index] = uint8_t(ti); } EmitMeshTasksEXT(count, 1, 1); #else diff --git a/shaders/pbr/indirect_cull.comp b/shaders/pbr/indirect_cull.comp index 08788b92..50ce2991 100644 --- a/shaders/pbr/indirect_cull.comp +++ b/shaders/pbr/indirect_cull.comp @@ -184,6 +184,6 @@ void main() cull_data.draws_out_pre.v[draw.first_draw_index + draw_cmd_offset] = draw; } - // record visiblity + // update visiblity cull_data.draws_in.v[gid].visible = visible; } \ No newline at end of file diff --git a/src/PBRDeferred.cpp b/src/PBRDeferred.cpp index 46712651..e83757c2 100644 --- a/src/PBRDeferred.cpp +++ b/src/PBRDeferred.cpp @@ -914,7 +914,7 @@ vierkant::Framebuffer &PBRDeferred::geometry_pass(cull_result_t &cull_result) gpu_cull_params.frustum_cull = frame_context.settings.frustum_culling; gpu_cull_params.occlusion_cull = frame_context.settings.occlusion_culling; gpu_cull_params.lod_enabled = frame_context.settings.enable_lod; -// gpu_cull_params.skip_meshlets = frame_context.settings.use_meshlet_pipeline; + gpu_cull_params.skip_meshlets = frame_context.settings.use_meshlet_pipeline; gpu_cull_params.depth_pyramid = frame_context.depth_pyramid; gpu_cull_params.draws_in = frame_context.indirect_draw_params_main.draws_in; gpu_cull_params.mesh_draws_in = frame_context.indirect_draw_params_main.mesh_draws; diff --git a/src/Rasterizer.cpp b/src/Rasterizer.cpp index 7e3e4f5d..1d765eef 100644 --- a/src/Rasterizer.cpp +++ b/src/Rasterizer.cpp @@ -486,8 +486,8 @@ void Rasterizer::render(VkCommandBuffer command_buffer, frame_assets_t &frame_as 
if(!draw_asset.drawable->use_own_buffers) { descriptors[BINDING_VERTICES].buffers = {frame_assets.vertex_buffer_refs}; - descriptors[BINDING_MESH_DRAWS].buffers = {frame_assets.mesh_draw_buffer}; - descriptors[BINDING_MATERIAL].buffers = {frame_assets.material_buffer}; + descriptors[BINDING_MESH_DRAWS].buffers = {frame_assets.indirect_indexed_bundle.mesh_draws}; + descriptors[BINDING_MATERIAL].buffers = {frame_assets.indirect_indexed_bundle.materials}; descriptors[BINDING_DRAW_COMMANDS].buffers = {draw_buffer_indexed}; if(descriptors.contains(BINDING_MESHLET_VISIBILITY)) diff --git a/tests/TestMesh.cpp b/tests/TestMesh.cpp index 213e0f2b..3befa392 100644 --- a/tests/TestMesh.cpp +++ b/tests/TestMesh.cpp @@ -15,36 +15,28 @@ struct UniformBuffer glm::mat4 projection; }; -const std::vector vertices = - { - {{-0.5f, -0.5f, 0.f}, {1.0f, 0.0f, 0.0f, 1.f}, {0.f, 0.f}}, - {{-0.5f, 0.5f, 0.f}, {1.0f, 1.0f, 1.0f, 1.f}, {0.f, 1.f}}, - {{0.5f, 0.5f, 0.f}, {0.0f, 0.0f, 1.0f, 1.f}, {1.f, 1.f}}, - {{0.5f, -0.5f, 0.f}, {0.0f, 1.0f, 0.0f, 1.f}, {1.f, 0.f}}, - - {{-0.5f, -0.5f, -0.5f}, {1.0f, 0.0f, 0.0f, 1.f}, {0.f, 0.f}}, - {{-0.5f, 0.5f, -0.5f}, {1.0f, 1.0f, 1.0f, 1.f}, {0.f, 1.f}}, - {{0.5f, 0.5f, -0.5f}, {0.0f, 0.0f, 1.0f, 1.f}, {1.f, 1.f}}, - {{0.5f, -0.5f, -0.5f}, {0.0f, 1.0f, 0.0f, 1.f}, {1.f, 0.f}} - }; - -const std::vector indices = - { - 0, 1, 2, 0, 2, 3, - 4, 5, 6, 4, 6, 7 - }; +const std::vector vertices = {{{-0.5f, -0.5f, 0.f}, {1.0f, 0.0f, 0.0f, 1.f}, {0.f, 0.f}}, + {{-0.5f, 0.5f, 0.f}, {1.0f, 1.0f, 1.0f, 1.f}, {0.f, 1.f}}, + {{0.5f, 0.5f, 0.f}, {0.0f, 0.0f, 1.0f, 1.f}, {1.f, 1.f}}, + {{0.5f, -0.5f, 0.f}, {0.0f, 1.0f, 0.0f, 1.f}, {1.f, 0.f}}, + + {{-0.5f, -0.5f, -0.5f}, {1.0f, 0.0f, 0.0f, 1.f}, {0.f, 0.f}}, + {{-0.5f, 0.5f, -0.5f}, {1.0f, 1.0f, 1.0f, 1.f}, {0.f, 1.f}}, + {{0.5f, 0.5f, -0.5f}, {0.0f, 0.0f, 1.0f, 1.f}, {1.f, 1.f}}, + {{0.5f, -0.5f, -0.5f}, {0.0f, 1.0f, 0.0f, 1.f}, {1.f, 0.f}}}; + +const std::vector indices = {0, 1, 2, 0, 2, 3, 4, 5, 6, 4, 6, 7}; 
/////////////////////////////////////////////////////////////////////////////////////////////////// -vierkant::MeshPtr create_mesh(const vierkant::DevicePtr &device, - const std::vector &vertices, +vierkant::MeshPtr create_mesh(const vierkant::DevicePtr &device, const std::vector &vertices, const std::vector &indices) { auto ret = vierkant::Mesh::create(); // vertex attributes - auto vertex_buffer = vierkant::Buffer::create(device, vertices, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, - VMA_MEMORY_USAGE_GPU_ONLY); + auto vertex_buffer = + vierkant::Buffer::create(device, vertices, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, VMA_MEMORY_USAGE_GPU_ONLY); vierkant::vertex_attrib_t position, color, tex_coord; position.offset = offsetof(Vertex, position); @@ -65,8 +57,8 @@ vierkant::MeshPtr create_mesh(const vierkant::DevicePtr &device, tex_coord.format = vierkant::format(); ret->vertex_attribs[2] = tex_coord; - ret->index_buffer = vierkant::Buffer::create(device, indices, VK_BUFFER_USAGE_INDEX_BUFFER_BIT, - VMA_MEMORY_USAGE_GPU_ONLY); + ret->index_buffer = + vierkant::Buffer::create(device, indices, VK_BUFFER_USAGE_INDEX_BUFFER_BIT, VMA_MEMORY_USAGE_GPU_ONLY); return ret; } @@ -74,8 +66,7 @@ vierkant::descriptor_map_t create_descriptors(const vierkant::DevicePtr &device) { // host visible, empty uniform-buffer auto uniform_buffer = vierkant::Buffer::create(device, nullptr, sizeof(UniformBuffer), - VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, - VMA_MEMORY_USAGE_CPU_ONLY); + VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, VMA_MEMORY_USAGE_CPU_ONLY); // fill Uniformbuffer auto ubo = static_cast(uniform_buffer->map()); ubo->model = glm::mat4(1); @@ -96,8 +87,7 @@ vierkant::descriptor_map_t create_descriptors(const vierkant::DevicePtr &device) desc_texture.stage_flags = VK_SHADER_STAGE_FRAGMENT_BIT; desc_texture.images = {texture}; - return {{0, desc_ubo}, - {1, desc_texture}}; + return {{0, desc_ubo}, {1, desc_texture}}; } TEST(Mesh, Constructor) @@ -117,17 +107,17 @@ TEST(Mesh, basic) auto descriptor_set_layout 
= vierkant::create_descriptor_set_layout(test_context.device, descriptors); // construct a pool to hold enough descriptors for the mesh - vierkant::descriptor_count_t descriptor_counts = {{VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 1}, + vierkant::descriptor_count_t descriptor_counts = {{VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 1}, {VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 1}}; auto pool = vierkant::create_descriptor_pool(test_context.device, descriptor_counts, 16); // use the pool to allocate the actual descriptor-set - auto descriptor_set = vierkant::create_descriptor_set(test_context.device, pool, descriptor_set_layout, false); + auto descriptor_set = + vierkant::create_descriptor_set(test_context.device, pool, descriptor_set_layout.get(), false); // update the descriptor set vierkant::update_descriptor_set(test_context.device, descriptors, descriptor_set); - } TEST(Mesh, Format) From a7c089d1421be06ba82233302c9f7654f79ba15b Mon Sep 17 00:00:00 2001 From: crocdialer Date: Sun, 20 Oct 2024 10:13:52 +0200 Subject: [PATCH 10/23] set meshlet-visibility bits atomically --- shaders/pbr/cull_meshlets.task | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/shaders/pbr/cull_meshlets.task b/shaders/pbr/cull_meshlets.task index d0129b2d..3f6c1e28 100644 --- a/shaders/pbr/cull_meshlets.task +++ b/shaders/pbr/cull_meshlets.task @@ -71,11 +71,11 @@ void set_visible(uint visibility_base_index, uint meshlet_index, bool visible) { if(visible) { - meshlet_visibilities[visibility_base_index + meshlet_index >> 5] |= (1 << (meshlet_index & 31U)); + atomicOr(meshlet_visibilities[visibility_base_index + meshlet_index >> 5], 1 << (meshlet_index & 31U)); } else { - meshlet_visibilities[visibility_base_index + meshlet_index >> 5] &= ~(1 << (meshlet_index & 31U)); + atomicAnd(meshlet_visibilities[visibility_base_index + meshlet_index >> 5], ~(1 << (meshlet_index & 31U))); } } @@ -158,4 +158,4 @@ void main() uint count = gl_WorkGroupSize.x; EmitMeshTasksEXT(count, 1, 1); #endif -} 
\ No newline at end of file +} From 9fe778434b8897a99cd313e82477df0c7b90db2d Mon Sep 17 00:00:00 2001 From: crocdialer Date: Sun, 20 Oct 2024 11:57:11 +0200 Subject: [PATCH 11/23] move all meshlet-culling into post-pass --- shaders/pbr/cull_meshlets.task | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/shaders/pbr/cull_meshlets.task b/shaders/pbr/cull_meshlets.task index 3f6c1e28..7bf0c4e2 100644 --- a/shaders/pbr/cull_meshlets.task +++ b/shaders/pbr/cull_meshlets.task @@ -99,23 +99,25 @@ void main() #if CULLING - // transform bounding volume and normal-cone - mat4 m = camera.view * mat4_cast(draws[object_index].current_matrices.transform); - vec3 cone_axis = normalize(mat3(m) * meshlets[mi].cone_axis); - float cone_cutoff = meshlets[mi].cone_cutoff; - vec3 center = (m * vec4(meshlets[mi].sphere_center, 1.0)).xyz; - float radius = meshlets[mi].sphere_radius * length(m[0].xyz); - - // cone-culling - bool visible = materials[draws[object_index].material_index].two_sided || - !cone_cull(center, radius, cone_axis, cone_cutoff, vec3(0)); - - // frustum-culling - visible = visible && !frustum_cull(center, radius, camera.frustum); + bool visible = true; // occlusion-culling / visibility recording if(post_pass) { + // transform bounding volume and normal-cone + mat4 m = camera.view * mat4_cast(draws[object_index].current_matrices.transform); + vec3 cone_axis = normalize(mat3(m) * meshlets[mi].cone_axis); + float cone_cutoff = meshlets[mi].cone_cutoff; + vec3 center = (m * vec4(meshlets[mi].sphere_center, 1.0)).xyz; + float radius = meshlets[mi].sphere_radius * length(m[0].xyz); + + // cone-culling + visible = materials[draws[object_index].material_index].two_sided || + !cone_cull(center, radius, cone_axis, cone_cutoff, vec3(0)); + + // frustum-culling + visible = visible && !frustum_cull(center, radius, camera.frustum); + vec4 aabb; bool sphere_visible = project_sphere(center, radius, camera.near, camera.projection[0][0], 
camera.projection[1][1], aabb); From 8fd1f3e1b77f5ce97dcb659095e04ed29afc8867 Mon Sep 17 00:00:00 2001 From: crocdialer Date: Sun, 20 Oct 2024 14:40:04 +0200 Subject: [PATCH 12/23] noodling with last mile, wip here with some visibility-glitches --- include/vierkant/Rasterizer.hpp | 2 +- shaders/pbr/cull_meshlets.task | 12 +++++++----- src/PBRDeferred.cpp | 4 +++- src/Rasterizer.cpp | 10 ++++++---- 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/include/vierkant/Rasterizer.hpp b/include/vierkant/Rasterizer.hpp index c03ce3ba..632b2eca 100644 --- a/include/vierkant/Rasterizer.hpp +++ b/include/vierkant/Rasterizer.hpp @@ -108,7 +108,7 @@ class Rasterizer vierkant::BufferPtr materials; //! device array a visibility bitfield for all meshlets - vierkant::BufferPtr meshlet_visibilies; + vierkant::BufferPtr meshlet_visibilities; //! host-visible array of indexed_indirect_command_t vierkant::BufferPtr draws_in; diff --git a/shaders/pbr/cull_meshlets.task b/shaders/pbr/cull_meshlets.task index 7bf0c4e2..1b555a6d 100644 --- a/shaders/pbr/cull_meshlets.task +++ b/shaders/pbr/cull_meshlets.task @@ -64,18 +64,18 @@ bool cone_cull(vec3 center, float radius, vec3 cone_axis, float cone_cutoff, vec bool is_visible(uint visibility_base_index, uint meshlet_index) { - return (meshlet_visibilities[visibility_base_index + meshlet_index >> 5] & (1 << (meshlet_index & 31U))) != 0; + return (meshlet_visibilities[visibility_base_index + (meshlet_index >> 5)] & (1 << (meshlet_index & 31U))) != 0; } void set_visible(uint visibility_base_index, uint meshlet_index, bool visible) { if(visible) { - atomicOr(meshlet_visibilities[visibility_base_index + meshlet_index >> 5], 1 << (meshlet_index & 31U)); + atomicOr(meshlet_visibilities[visibility_base_index + (meshlet_index >> 5)], 1 << (meshlet_index & 31U)); } else { - atomicAnd(meshlet_visibilities[visibility_base_index + meshlet_index >> 5], ~(1 << (meshlet_index & 31U))); + atomicAnd(meshlet_visibilities[visibility_base_index + 
(meshlet_index >> 5)], ~(1 << (meshlet_index & 31U))); } } @@ -135,12 +135,14 @@ void main() visible = visible && (depth_sphere >= depth); // became visible this frame -> not drawn in 1st-pass - visible = visible && !is_visible(draw_command.meshlet_visibility_index, mi); - + bool needs_post_draw = visible && !is_visible(draw_command.meshlet_visibility_index, mi); // if(!visible){ atomicAdd(cull_data.draw_result.v.num_occlusion_culled, 1); } // update visiblity set_visible(draw_command.meshlet_visibility_index, mi, visible); + + // post_draw + visible = needs_post_draw; } else{ visible = visible && is_visible(draw_command.meshlet_visibility_index, mi); } diff --git a/src/PBRDeferred.cpp b/src/PBRDeferred.cpp index e83757c2..d2ef9c52 100644 --- a/src/PBRDeferred.cpp +++ b/src/PBRDeferred.cpp @@ -771,6 +771,7 @@ vierkant::Framebuffer &PBRDeferred::geometry_pass(cull_result_t &cull_result) if(drawable.descriptors.contains(Rasterizer::BINDING_DEPTH_PYRAMID) && drawable.pipeline_format.specialization) { + //layout (constant_id = 2) const bool post_pass drawable.pipeline_format.specialization->set(2, VK_TRUE); } m_g_renderer_post.stage_drawable(drawable); @@ -798,6 +799,7 @@ vierkant::Framebuffer &PBRDeferred::geometry_pass(cull_result_t &cull_result) frame_context.indirect_draw_params_main.draws_out = params.draws_out; frame_context.indirect_draw_params_main.mesh_draws = params.mesh_draws; frame_context.indirect_draw_params_main.mesh_entries = params.mesh_entries; + frame_context.indirect_draw_params_main.meshlet_visibilities = params.meshlet_visibilities; vierkant::staging_copy_context_t staging_context = {}; staging_context.staging_buffer = frame_context.staging_main; @@ -901,7 +903,7 @@ vierkant::Framebuffer &PBRDeferred::geometry_pass(cull_result_t &cull_result) // re-use mesh-draws/transforms/visibilities from main-pass params.mesh_draws = frame_context.indirect_draw_params_main.mesh_draws; params.mesh_entries = frame_context.indirect_draw_params_main.mesh_entries; 
- params.meshlet_visibilies = frame_context.indirect_draw_params_main.meshlet_visibilies; + params.meshlet_visibilities = frame_context.indirect_draw_params_main.meshlet_visibilities; // populate gpu-culling params vierkant::gpu_cull_params_t gpu_cull_params = {}; diff --git a/src/Rasterizer.cpp b/src/Rasterizer.cpp index 1d765eef..79f0db16 100644 --- a/src/Rasterizer.cpp +++ b/src/Rasterizer.cpp @@ -492,7 +492,8 @@ void Rasterizer::render(VkCommandBuffer command_buffer, frame_assets_t &frame_as if(descriptors.contains(BINDING_MESHLET_VISIBILITY)) { - descriptors[BINDING_MESHLET_VISIBILITY].buffers = {frame_assets.meshlet_visibility_buffer}; + descriptors[BINDING_MESHLET_VISIBILITY].buffers = { + frame_assets.indirect_indexed_bundle.meshlet_visibilities}; } } @@ -738,11 +739,12 @@ void Rasterizer::update_buffers(const std::vector &drawables, Raster material_data.push_back(drawable.material); } - // set all meshlet-bits hi/visible for all entry-lods + // set visibility-bits low/hi for all lods size_t num_array_elems = 0; + uint32_t vis = indirect_draw ? 
0 : 0xFFFFFFFF; const auto &entry = drawable.mesh->entries[drawable.entry_index]; for(const auto &lod: entry.lods) { num_array_elems += div_up(lod.num_meshlets, 32); } - meshlet_visibility_data.resize(meshlet_visibility_data.size() + num_array_elems, 0xFFFFFFFF); + meshlet_visibility_data.resize(meshlet_visibility_data.size() + num_array_elems, vis); } else { material_data.push_back(drawable.material); } @@ -847,7 +849,7 @@ void Rasterizer::update_buffers(const std::vector &drawables, Raster frame_asset.indirect_indexed_bundle.mesh_draws = frame_asset.mesh_draw_buffer; frame_asset.indirect_indexed_bundle.mesh_entries = frame_asset.mesh_entry_buffer; frame_asset.indirect_indexed_bundle.materials = frame_asset.material_buffer; - frame_asset.indirect_indexed_bundle.meshlet_visibilies = frame_asset.meshlet_visibility_buffer; + frame_asset.indirect_indexed_bundle.meshlet_visibilities = frame_asset.meshlet_visibility_buffer; } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// From 10f4db0d9a3dbbb203ac5f1750f7b7bb7932266f Mon Sep 17 00:00:00 2001 From: crocdialer Date: Sun, 20 Oct 2024 16:03:43 +0200 Subject: [PATCH 13/23] meshlet contribution-culling, minor restructuring in shader --- shaders/pbr/cull_meshlets.task | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/shaders/pbr/cull_meshlets.task b/shaders/pbr/cull_meshlets.task index 1b555a6d..50eb099d 100644 --- a/shaders/pbr/cull_meshlets.task +++ b/shaders/pbr/cull_meshlets.task @@ -124,15 +124,28 @@ void main() // cluster bound area in NDC vec2 screen_area = aabb.zw - aabb.xy; vec2 pyramid_size = textureSize(u_depth_pyramid, 0); - float width = screen_area.x * pyramid_size.x; - float height = screen_area.y * pyramid_size.y; - float level = floor(log2(max(width, height))); - // Sampler is set up to do min reduction, so this computes the minimum depth of a 2x2 texel quad - float depth = 
textureLod(u_depth_pyramid, (aabb.xy + aabb.zw) * 0.5, level).x; - float depth_sphere = camera.near / (-center.z - radius); - - visible = visible && (depth_sphere >= depth); + // contribution-culling + if(visible && sphere_visible) + { + // contribution cull (based on screen-area threshold, ~1 px) + const float size_thresh = 1.0 / pyramid_size.x; + visible = max(screen_area.x, screen_area.y) >= size_thresh; + } + + // occlusion-culling + if(visible && sphere_visible) + { + float width = screen_area.x * pyramid_size.x; + float height = screen_area.y * pyramid_size.y; + float level = floor(log2(max(width, height))); + + // Sampler is set up to do min reduction, so this computes the minimum depth of a 2x2 texel quad + float depth = textureLod(u_depth_pyramid, (aabb.xy + aabb.zw) * 0.5, level).x; + float depth_sphere = camera.near / (-center.z - radius); + + visible = depth_sphere >= depth; + } // became visible this frame -> not drawn in 1st-pass bool needs_post_draw = visible && !is_visible(draw_command.meshlet_visibility_index, mi); From d56ac361bf242262f2577ee91cddbc4b1d0ebdf4 Mon Sep 17 00:00:00 2001 From: crocdialer Date: Mon, 21 Oct 2024 18:26:21 +0200 Subject: [PATCH 14/23] some minor changes in shaders, still flickering-issue with meshlet-occlusion --- shaders/pbr/cull_meshlets.task | 32 +++++++++++++++++--------------- shaders/pbr/g_buffer.mesh | 14 ++++---------- shaders/pbr/g_buffer.vert | 5 +---- shaders/pbr/indirect_cull.comp | 6 +++--- 4 files changed, 25 insertions(+), 32 deletions(-) diff --git a/shaders/pbr/cull_meshlets.task b/shaders/pbr/cull_meshlets.task index 50eb099d..2cd4a53d 100644 --- a/shaders/pbr/cull_meshlets.task +++ b/shaders/pbr/cull_meshlets.task @@ -64,7 +64,7 @@ bool cone_cull(vec3 center, float radius, vec3 cone_axis, float cone_cutoff, vec bool is_visible(uint visibility_base_index, uint meshlet_index) { - return (meshlet_visibilities[visibility_base_index + (meshlet_index >> 5)] & (1 << (meshlet_index & 31U))) != 0; + return 
(meshlet_visibilities[visibility_base_index + (meshlet_index >> 5)] & (1 << (meshlet_index & 31U))) != 0; } void set_visible(uint visibility_base_index, uint meshlet_index, bool visible) @@ -109,7 +109,9 @@ void main() vec3 cone_axis = normalize(mat3(m) * meshlets[mi].cone_axis); float cone_cutoff = meshlets[mi].cone_cutoff; vec3 center = (m * vec4(meshlets[mi].sphere_center, 1.0)).xyz; - float radius = meshlets[mi].sphere_radius * length(m[0].xyz); + float radius = meshlets[mi].sphere_radius * max(max(abs(draws[object_index].current_matrices.transform.scale_x), + abs(draws[object_index].current_matrices.transform.scale_y)), + abs(draws[object_index].current_matrices.transform.scale_z)); // cone-culling visible = materials[draws[object_index].material_index].two_sided || @@ -133,19 +135,19 @@ void main() visible = max(screen_area.x, screen_area.y) >= size_thresh; } - // occlusion-culling - if(visible && sphere_visible) - { - float width = screen_area.x * pyramid_size.x; - float height = screen_area.y * pyramid_size.y; - float level = floor(log2(max(width, height))); - - // Sampler is set up to do min reduction, so this computes the minimum depth of a 2x2 texel quad - float depth = textureLod(u_depth_pyramid, (aabb.xy + aabb.zw) * 0.5, level).x; - float depth_sphere = camera.near / (-center.z - radius); - - visible = depth_sphere >= depth; - } +// // occlusion-culling +// if(visible && sphere_visible) +// { +// float width = screen_area.x * pyramid_size.x; +// float height = screen_area.y * pyramid_size.y; +// float level = floor(log2(max(width, height))); +// +// // Sampler is set up to do min reduction, so this computes the minimum depth of a 2x2 texel quad +// float depth = camera.near / textureLod(u_depth_pyramid, (aabb.xy + aabb.zw) * 0.5, level).x; +// float depth_sphere = -center.z - radius; +// +// visible = depth_sphere <= depth; +// } // became visible this frame -> not drawn in 1st-pass bool needs_post_draw = visible && 
!is_visible(draw_command.meshlet_visibility_index, mi); diff --git a/shaders/pbr/g_buffer.mesh b/shaders/pbr/g_buffer.mesh index a25cd594..2bbd8ef9 100644 --- a/shaders/pbr/g_buffer.mesh +++ b/shaders/pbr/g_buffer.mesh @@ -10,10 +10,11 @@ #extension GL_EXT_shader_16bit_storage: require #extension GL_EXT_shader_explicit_arithmetic_types: require -#include "../renderer/mesh_task_payload.glsl" +#include "g_buffer_vertex_data.glsl" #include "../renderer/types.glsl" -#include "../utils/camera.glsl" +#include "../renderer/mesh_task_payload.glsl" #include "../utils/packed_vertex.glsl" +#include "../utils/camera.glsl" // TODO: not really worth it? not sure ... #define CULLING 0 @@ -59,14 +60,7 @@ layout(push_constant) uniform PushConstants taskPayloadSharedEXT mesh_task_payload_t task_payload; layout(location = LOCATION_INDEX_BUNDLE) flat out index_bundle_t indices[]; -layout(location = LOCATION_VERTEX_BUNDLE) out VertexData -{ - vec2 tex_coord; - vec3 normal; - vec3 tangent; - vec4 current_position; - vec4 last_position; -} vertex_out[]; +layout(location = LOCATION_VERTEX_BUNDLE) out g_buffer_vertex_data_t vertex_out[]; #if CULLING shared vec3 vertex_clip[MESHLET_MAX_VERTICES]; diff --git a/shaders/pbr/g_buffer.vert b/shaders/pbr/g_buffer.vert index 50cb10d3..0f2dafb8 100644 --- a/shaders/pbr/g_buffer.vert +++ b/shaders/pbr/g_buffer.vert @@ -30,10 +30,7 @@ layout(push_constant) uniform PushConstants }; layout(location = LOCATION_INDEX_BUNDLE) flat out index_bundle_t indices; -layout(location = LOCATION_VERTEX_BUNDLE) out VertexData -{ - g_buffer_vertex_data_t vertex_out; -}; +layout(location = LOCATION_VERTEX_BUNDLE) out g_buffer_vertex_data_t vertex_out; void main() { diff --git a/shaders/pbr/indirect_cull.comp b/shaders/pbr/indirect_cull.comp index 50ce2991..9373fc47 100644 --- a/shaders/pbr/indirect_cull.comp +++ b/shaders/pbr/indirect_cull.comp @@ -128,10 +128,10 @@ void main() float level = floor(log2(max(width, height))); // Sampler is set up to do min reduction, so 
this computes the minimum depth of a 2x2 texel quad - float depth = textureLod(u_depth_pyramid, (aabb.xy + aabb.zw) * 0.5, level).x; - float depth_sphere = cull_data.znear / (-center.z - radius); + float depth = cull_data.znear / textureLod(u_depth_pyramid, (aabb.xy + aabb.zw) * 0.5, level).x; + float depth_sphere = -center.z - radius; - visible = visible && (depth_sphere >= depth); + visible = depth_sphere <= depth; if(!visible){ atomicAdd(cull_data.draw_result.v.num_occlusion_culled, 1); } } From d15b26dc90bc79e8a3f2bb6729036933f8cff7aa Mon Sep 17 00:00:00 2001 From: crocdialer Date: Tue, 22 Oct 2024 08:05:01 +0200 Subject: [PATCH 15/23] workgroup-barrier ftw, now only traces of failed occlusion-culling left --- shaders/pbr/cull_meshlets.task | 45 +++++++++++++++++++--------------- shaders/pbr/indirect_cull.comp | 3 ++- 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/shaders/pbr/cull_meshlets.task b/shaders/pbr/cull_meshlets.task index 2cd4a53d..2b0ca06b 100644 --- a/shaders/pbr/cull_meshlets.task +++ b/shaders/pbr/cull_meshlets.task @@ -100,21 +100,25 @@ void main() #if CULLING bool visible = true; + bool visible_last_frame = is_visible(draw_command.meshlet_visibility_index, mi); + barrier(); // occlusion-culling / visibility recording if(post_pass) { + mesh_draw_t draw = draws[object_index]; + // transform bounding volume and normal-cone - mat4 m = camera.view * mat4_cast(draws[object_index].current_matrices.transform); + mat4 m = camera.view * mat4_cast(draw.current_matrices.transform); vec3 cone_axis = normalize(mat3(m) * meshlets[mi].cone_axis); float cone_cutoff = meshlets[mi].cone_cutoff; vec3 center = (m * vec4(meshlets[mi].sphere_center, 1.0)).xyz; - float radius = meshlets[mi].sphere_radius * max(max(abs(draws[object_index].current_matrices.transform.scale_x), - abs(draws[object_index].current_matrices.transform.scale_y)), - abs(draws[object_index].current_matrices.transform.scale_z)); + float radius = meshlets[mi].sphere_radius * 
max(max(abs(draw.current_matrices.transform.scale_x), + abs(draw.current_matrices.transform.scale_y)), + abs(draw.current_matrices.transform.scale_z)); // cone-culling - visible = materials[draws[object_index].material_index].two_sided || + visible = materials[draw.material_index].two_sided || !cone_cull(center, radius, cone_axis, cone_cutoff, vec3(0)); // frustum-culling @@ -135,22 +139,23 @@ void main() visible = max(screen_area.x, screen_area.y) >= size_thresh; } -// // occlusion-culling -// if(visible && sphere_visible) -// { -// float width = screen_area.x * pyramid_size.x; -// float height = screen_area.y * pyramid_size.y; -// float level = floor(log2(max(width, height))); -// -// // Sampler is set up to do min reduction, so this computes the minimum depth of a 2x2 texel quad -// float depth = camera.near / textureLod(u_depth_pyramid, (aabb.xy + aabb.zw) * 0.5, level).x; -// float depth_sphere = -center.z - radius; -// -// visible = depth_sphere <= depth; -// } + // occlusion-culling + if(visible && sphere_visible) + { + float width = screen_area.x * pyramid_size.x; + float height = screen_area.y * pyramid_size.y; + float level = floor(log2(max(width, height))); + + // Sampler is set up to do min reduction, so this computes the minimum depth of a 2x2 texel quad + float depth = clamp(camera.near / textureLod(u_depth_pyramid, (aabb.xy + aabb.zw) * 0.5, level).x, + camera.near, camera.far); + float depth_sphere = -center.z - radius; + + visible = depth_sphere <= depth; + } // became visible this frame -> not drawn in 1st-pass - bool needs_post_draw = visible && !is_visible(draw_command.meshlet_visibility_index, mi); + bool needs_post_draw = visible && !visible_last_frame; // if(!visible){ atomicAdd(cull_data.draw_result.v.num_occlusion_culled, 1); } // update visiblity @@ -159,7 +164,7 @@ void main() // post_draw visible = needs_post_draw; } - else{ visible = visible && is_visible(draw_command.meshlet_visibility_index, mi); } + else{ visible = 
visible_last_frame; } // determine indices/count via ballot uvec4 ballot = subgroupBallot(visible); diff --git a/shaders/pbr/indirect_cull.comp b/shaders/pbr/indirect_cull.comp index 9373fc47..460a36a8 100644 --- a/shaders/pbr/indirect_cull.comp +++ b/shaders/pbr/indirect_cull.comp @@ -128,7 +128,8 @@ void main() float level = floor(log2(max(width, height))); // Sampler is set up to do min reduction, so this computes the minimum depth of a 2x2 texel quad - float depth = cull_data.znear / textureLod(u_depth_pyramid, (aabb.xy + aabb.zw) * 0.5, level).x; + float depth = clamp(cull_data.znear / textureLod(u_depth_pyramid, (aabb.xy + aabb.zw) * 0.5, level).x, + cull_data.znear, cull_data.zfar); float depth_sphere = -center.z - radius; visible = depth_sphere <= depth; From f3253b5414b426a58e7c4d191ef5139aece67c0d Mon Sep 17 00:00:00 2001 From: crocdialer Date: Tue, 22 Oct 2024 12:47:55 +0200 Subject: [PATCH 16/23] minor changes --- include/vierkant/gpu_culling.hpp | 13 +++++++++++-- shaders/pbr/cull_meshlets.task | 4 ++-- src/PBRDeferred.cpp | 2 +- src/RayTracer.cpp | 7 ++----- src/gpu_culling.cpp | 5 +++++ 5 files changed, 21 insertions(+), 10 deletions(-) diff --git a/include/vierkant/gpu_culling.hpp b/include/vierkant/gpu_culling.hpp index 23003534..47f142b5 100644 --- a/include/vierkant/gpu_culling.hpp +++ b/include/vierkant/gpu_culling.hpp @@ -86,9 +86,18 @@ gpu_cull_context_ptr create_gpu_cull_context(const vierkant::DevicePtr &device, const vierkant::PipelineCachePtr &pipeline_cache = nullptr); /** - * @brief create_depth_pyramid can be used to create a 'hierarchical z-buffer (hzb)' or 'depth-pyramid'. + * @brief retrieve internally stored 'hierarchical z-buffer (hzb)' / depth-pyramid. * - * @param context a provided vierkant::Device. 
+ * @param context a provided gpu_cull_context_t + * @param params a provided struct with parameters + * @return a vierkant::ImagePtr containing the created depth-pyramid + */ +vierkant::ImagePtr get_depth_pyramid(const vierkant::gpu_cull_context_ptr &context); + +/** + * @brief create_depth_pyramid can be used to create a 'hierarchical z-buffer (hzb)' /depth-pyramid. + * + * @param context a provided gpu_cull_context_t * @param params a provided struct with parameters * @return a vierkant::ImagePtr containing the created depth-pyramid */ diff --git a/shaders/pbr/cull_meshlets.task b/shaders/pbr/cull_meshlets.task index 2b0ca06b..888c87ba 100644 --- a/shaders/pbr/cull_meshlets.task +++ b/shaders/pbr/cull_meshlets.task @@ -71,11 +71,11 @@ void set_visible(uint visibility_base_index, uint meshlet_index, bool visible) { if(visible) { - atomicOr(meshlet_visibilities[visibility_base_index + (meshlet_index >> 5)], 1 << (meshlet_index & 31U)); + atomicOr(meshlet_visibilities[visibility_base_index + (meshlet_index >> 5u)], 1u << (meshlet_index & 31u)); } else { - atomicAnd(meshlet_visibilities[visibility_base_index + (meshlet_index >> 5)], ~(1 << (meshlet_index & 31U))); + atomicAnd(meshlet_visibilities[visibility_base_index + (meshlet_index >> 5u)], ~(1u << (meshlet_index & 31u))); } } diff --git a/src/PBRDeferred.cpp b/src/PBRDeferred.cpp index d2ef9c52..aef35f7b 100644 --- a/src/PBRDeferred.cpp +++ b/src/PBRDeferred.cpp @@ -733,7 +733,7 @@ vierkant::Framebuffer &PBRDeferred::geometry_pass(cull_result_t &cull_result) auto &desc_depth_pyramid = drawable.descriptors[Rasterizer::BINDING_DEPTH_PYRAMID]; desc_depth_pyramid.type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; desc_depth_pyramid.stage_flags = VK_SHADER_STAGE_TASK_BIT_EXT; - desc_depth_pyramid.images = {frame_context.depth_pyramid}; + desc_depth_pyramid.images = {vierkant::get_depth_pyramid(frame_context.gpu_cull_context)}; } // check if morph-targets are available diff --git a/src/RayTracer.cpp b/src/RayTracer.cpp 
index 329b8ece..8a825043 100644 --- a/src/RayTracer.cpp +++ b/src/RayTracer.cpp @@ -209,12 +209,9 @@ RayTracer::create_shader_binding_table(VkPipeline pipeline, const vierkant::rayt { auto &address_region = binding_table.strided_address_region[g]; address_region.deviceAddress = binding_table.buffer->device_address() + buffer_offset; - auto data_ptr = buf_ptr + buffer_offset; - buffer_offset += address_region.size; - - memcpy(data_ptr, group_handle_data.data() + handle_size * handle_index, handle_size); - data_ptr += address_region.stride; + memcpy(buf_ptr + buffer_offset, group_handle_data.data() + handle_size * handle_index, handle_size); handle_index++; + buffer_offset += address_region.size; } binding_table.buffer->unmap(); return binding_table; diff --git a/src/gpu_culling.cpp b/src/gpu_culling.cpp index e2cd9303..790dcbfa 100644 --- a/src/gpu_culling.cpp +++ b/src/gpu_culling.cpp @@ -381,4 +381,9 @@ gpu_cull_context_ptr create_gpu_cull_context(const DevicePtr &device, const vier return ret; } +vierkant::ImagePtr get_depth_pyramid(const vierkant::gpu_cull_context_ptr &context) +{ + return context->depth_pyramid_img; +} + }// namespace vierkant \ No newline at end of file From 04228f95d5364f30b50f3149dbcbfe199d295dbc Mon Sep 17 00:00:00 2001 From: crocdialer Date: Tue, 22 Oct 2024 18:44:31 +0200 Subject: [PATCH 17/23] minor corrections for non-culling meshlet count --- shaders/pbr/cull_meshlets.task | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/shaders/pbr/cull_meshlets.task b/shaders/pbr/cull_meshlets.task index 888c87ba..7653c581 100644 --- a/shaders/pbr/cull_meshlets.task +++ b/shaders/pbr/cull_meshlets.task @@ -177,9 +177,7 @@ void main() #else task_payload.meshlet_delta_indices[ti] = uint8_t(ti); - - // TODO: wonky, emitting too many meshlets in last workgroup - uint count = gl_WorkGroupSize.x; + uint count = min(draw_command.num_meshlets - gl_WorkGroupID.x * gl_WorkGroupSize.x, gl_WorkGroupSize.x); EmitMeshTasksEXT(count, 1, 1); 
#endif } From 9152e242e4521bd08a5ee287fdbb11ad5a374488 Mon Sep 17 00:00:00 2001 From: crocdialer Date: Wed, 23 Oct 2024 08:58:34 +0200 Subject: [PATCH 18/23] meshlet-visibility working now --- shaders/pbr/cull_meshlets.task | 12 ++++++------ src/Rasterizer.cpp | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/shaders/pbr/cull_meshlets.task b/shaders/pbr/cull_meshlets.task index 7653c581..07a05156 100644 --- a/shaders/pbr/cull_meshlets.task +++ b/shaders/pbr/cull_meshlets.task @@ -64,7 +64,7 @@ bool cone_cull(vec3 center, float radius, vec3 cone_axis, float cone_cutoff, vec bool is_visible(uint visibility_base_index, uint meshlet_index) { - return (meshlet_visibilities[visibility_base_index + (meshlet_index >> 5)] & (1 << (meshlet_index & 31U))) != 0; + return (meshlet_visibilities[visibility_base_index + (meshlet_index >> 5u)] & (1u << (meshlet_index & 31u))) != 0; } void set_visible(uint visibility_base_index, uint meshlet_index, bool visible) @@ -85,7 +85,7 @@ void main() { uint gid = gl_GlobalInvocationID.x; const indexed_indirect_command_t draw_command = draw_commands[context.base_draw_index + gl_DrawID]; - if(gid >= draw_command.num_meshlets){ return; } + bool valid = gid < draw_command.num_meshlets; uint object_index = draw_command.object_index; uint meshlet_base_index = gl_WorkGroupID.x * gl_WorkGroupSize.x + draw_command.base_meshlet; @@ -99,12 +99,12 @@ void main() #if CULLING - bool visible = true; - bool visible_last_frame = is_visible(draw_command.meshlet_visibility_index, mi); + bool visible = valid; + bool visible_last_frame = valid && is_visible(draw_command.meshlet_visibility_index, mi); barrier(); // occlusion-culling / visibility recording - if(post_pass) + if(post_pass && valid) { mesh_draw_t draw = draws[object_index]; @@ -164,7 +164,7 @@ void main() // post_draw visible = needs_post_draw; } - else{ visible = visible_last_frame; } + else{ visible = visible && visible_last_frame; } // determine indices/count via ballot uvec4 
ballot = subgroupBallot(visible); diff --git a/src/Rasterizer.cpp b/src/Rasterizer.cpp index 79f0db16..e488caa1 100644 --- a/src/Rasterizer.cpp +++ b/src/Rasterizer.cpp @@ -377,7 +377,7 @@ void Rasterizer::render(VkCommandBuffer command_buffer, frame_assets_t &frame_as // bindless texture-array pipeline_format.descriptor_set_layouts.push_back(bindless_texture_layout.get()); - if(drawable.mesh && drawable.mesh->entries.size() < drawable.entry_index) + if(drawable.mesh && drawable.entry_index < drawable.mesh->entries.size()) { indexed_drawable.meshlet_visibility_index = meshlet_visibility_index; for(const auto &lod: drawable.mesh->entries[drawable.entry_index].lods) From 5f4435042ef5e113da8bc70052b179a96e8e6eac Mon Sep 17 00:00:00 2001 From: crocdialer Date: Wed, 23 Oct 2024 10:28:14 +0200 Subject: [PATCH 19/23] swap ballot with atomic --- shaders/pbr/cull_meshlets.task | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/shaders/pbr/cull_meshlets.task b/shaders/pbr/cull_meshlets.task index 07a05156..6425da58 100644 --- a/shaders/pbr/cull_meshlets.task +++ b/shaders/pbr/cull_meshlets.task @@ -2,7 +2,7 @@ #extension GL_EXT_mesh_shader : require #extension GL_GOOGLE_include_directive : require #extension GL_EXT_buffer_reference2: require -#extension GL_KHR_shader_subgroup_ballot: require +#extension GL_EXT_null_initializer: require #include "../renderer/types.glsl" #include "../renderer/mesh_task_payload.glsl" @@ -81,6 +81,10 @@ void set_visible(uint visibility_base_index, uint meshlet_index, bool visible) #define CULLING 1 +#if CULLING +shared uint shared_count = {}; +#endif + void main() { uint gid = gl_GlobalInvocationID.x; @@ -101,7 +105,7 @@ void main() bool visible = valid; bool visible_last_frame = valid && is_visible(draw_command.meshlet_visibility_index, mi); - barrier(); +// barrier(); // occlusion-culling / visibility recording if(post_pass && valid) @@ -166,14 +170,15 @@ void main() } else{ visible = visible && 
visible_last_frame; } - // determine indices/count via ballot - uvec4 ballot = subgroupBallot(visible); - uint index = subgroupBallotExclusiveBitCount(ballot); - uint count = subgroupBallotBitCount(ballot); - // write payload delta-index - if (visible){ task_payload.meshlet_delta_indices[index] = uint8_t(ti); } - EmitMeshTasksEXT(count, 1, 1); + if (visible) + { + // determine indices/count via atomic + uint index = atomicAdd(shared_count, 1); + task_payload.meshlet_delta_indices[index] = uint8_t(ti); + } + barrier(); + EmitMeshTasksEXT(shared_count, 1, 1); #else task_payload.meshlet_delta_indices[ti] = uint8_t(ti); From 8b22c3cd92dddd54165409bc758ae42ad72d78df Mon Sep 17 00:00:00 2001 From: crocdialer Date: Wed, 23 Oct 2024 15:58:15 +0200 Subject: [PATCH 20/23] turn culling-preprocessor flag into constant_id --- shaders/pbr/cull_meshlets.task | 158 ++++++++++++++++----------------- src/PBRDeferred.cpp | 7 +- 2 files changed, 83 insertions(+), 82 deletions(-) diff --git a/shaders/pbr/cull_meshlets.task b/shaders/pbr/cull_meshlets.task index 6425da58..881301ef 100644 --- a/shaders/pbr/cull_meshlets.task +++ b/shaders/pbr/cull_meshlets.task @@ -10,7 +10,8 @@ #include "../utils/project_sphere.glsl" //! 
specialization constant for main/post pass -layout (constant_id = 2) const bool post_pass = false; +layout (constant_id = 2) const bool use_culling = false; +layout (constant_id = 3) const bool post_pass = false; layout(set = 0, binding = BINDING_DRAW_COMMANDS) readonly buffer DrawBuffer { @@ -79,11 +80,7 @@ void set_visible(uint visibility_base_index, uint meshlet_index, bool visible) } } -#define CULLING 1 - -#if CULLING shared uint shared_count = {}; -#endif void main() { @@ -101,88 +98,89 @@ void main() task_payload.object_index = object_index; task_payload.meshlet_base_index = meshlet_base_index; -#if CULLING - - bool visible = valid; - bool visible_last_frame = valid && is_visible(draw_command.meshlet_visibility_index, mi); -// barrier(); - - // occlusion-culling / visibility recording - if(post_pass && valid) + if(use_culling) { - mesh_draw_t draw = draws[object_index]; - - // transform bounding volume and normal-cone - mat4 m = camera.view * mat4_cast(draw.current_matrices.transform); - vec3 cone_axis = normalize(mat3(m) * meshlets[mi].cone_axis); - float cone_cutoff = meshlets[mi].cone_cutoff; - vec3 center = (m * vec4(meshlets[mi].sphere_center, 1.0)).xyz; - float radius = meshlets[mi].sphere_radius * max(max(abs(draw.current_matrices.transform.scale_x), - abs(draw.current_matrices.transform.scale_y)), - abs(draw.current_matrices.transform.scale_z)); - - // cone-culling - visible = materials[draw.material_index].two_sided || - !cone_cull(center, radius, cone_axis, cone_cutoff, vec3(0)); - - // frustum-culling - visible = visible && !frustum_cull(center, radius, camera.frustum); - - vec4 aabb; - bool sphere_visible = project_sphere(center, radius, camera.near, camera.projection[0][0], - camera.projection[1][1], aabb); - // cluster bound area in NDC - vec2 screen_area = aabb.zw - aabb.xy; - vec2 pyramid_size = textureSize(u_depth_pyramid, 0); - - // contribution-culling - if(visible && sphere_visible) + bool visible = valid; + bool visible_last_frame = 
valid && is_visible(draw_command.meshlet_visibility_index, mi); + // barrier(); + + // occlusion-culling / visibility recording + if(post_pass && valid) { - // contribution cull (based on screen-area threshold, ~1 px) - const float size_thresh = 1.0 / pyramid_size.x; - visible = max(screen_area.x, screen_area.y) >= size_thresh; + mesh_draw_t draw = draws[object_index]; + + // transform bounding volume and normal-cone + mat4 m = camera.view * mat4_cast(draw.current_matrices.transform); + vec3 cone_axis = normalize(mat3(m) * meshlets[mi].cone_axis); + float cone_cutoff = meshlets[mi].cone_cutoff; + vec3 center = (m * vec4(meshlets[mi].sphere_center, 1.0)).xyz; + float radius = meshlets[mi].sphere_radius * max(max(abs(draw.current_matrices.transform.scale_x), + abs(draw.current_matrices.transform.scale_y)), + abs(draw.current_matrices.transform.scale_z)); + + // cone-culling + visible = materials[draw.material_index].two_sided || + !cone_cull(center, radius, cone_axis, cone_cutoff, vec3(0)); + + // frustum-culling + visible = visible && !frustum_cull(center, radius, camera.frustum); + + vec4 aabb; + bool sphere_visible = project_sphere(center, radius, camera.near, camera.projection[0][0], + camera.projection[1][1], aabb); + // cluster bound area in NDC + vec2 screen_area = aabb.zw - aabb.xy; + vec2 pyramid_size = textureSize(u_depth_pyramid, 0); + + // contribution-culling + if(visible && sphere_visible) + { + // contribution cull (based on screen-area threshold, ~1 px) + const float size_thresh = 1.0 / pyramid_size.x; + visible = max(screen_area.x, screen_area.y) >= size_thresh; + } + + // occlusion-culling + if(visible && sphere_visible) + { + float width = screen_area.x * pyramid_size.x; + float height = screen_area.y * pyramid_size.y; + float level = floor(log2(max(width, height))); + + // Sampler is set up to do min reduction, so this computes the minimum depth of a 2x2 texel quad + float depth = clamp(camera.near / textureLod(u_depth_pyramid, (aabb.xy + aabb.zw) 
* 0.5, level).x, + camera.near, camera.far); + float depth_sphere = -center.z - radius; + + visible = depth_sphere <= depth; + } + + // became visible this frame -> not drawn in 1st-pass + bool needs_post_draw = visible && !visible_last_frame; + // if(!visible){ atomicAdd(cull_data.draw_result.v.num_occlusion_culled, 1); } + + // update visiblity + set_visible(draw_command.meshlet_visibility_index, mi, visible); + + // post_draw + visible = needs_post_draw; } + else{ visible = visible && visible_last_frame; } - // occlusion-culling - if(visible && sphere_visible) + // write payload delta-index + if (visible) { - float width = screen_area.x * pyramid_size.x; - float height = screen_area.y * pyramid_size.y; - float level = floor(log2(max(width, height))); - - // Sampler is set up to do min reduction, so this computes the minimum depth of a 2x2 texel quad - float depth = clamp(camera.near / textureLod(u_depth_pyramid, (aabb.xy + aabb.zw) * 0.5, level).x, - camera.near, camera.far); - float depth_sphere = -center.z - radius; - - visible = depth_sphere <= depth; + // determine indices/count via atomic + uint index = atomicAdd(shared_count, 1); + task_payload.meshlet_delta_indices[index] = uint8_t(ti); } - - // became visible this frame -> not drawn in 1st-pass - bool needs_post_draw = visible && !visible_last_frame; - // if(!visible){ atomicAdd(cull_data.draw_result.v.num_occlusion_culled, 1); } - - // update visiblity - set_visible(draw_command.meshlet_visibility_index, mi, visible); - - // post_draw - visible = needs_post_draw; + barrier(); + EmitMeshTasksEXT(shared_count, 1, 1); } - else{ visible = visible && visible_last_frame; } - - // write payload delta-index - if (visible) + else { - // determine indices/count via atomic - uint index = atomicAdd(shared_count, 1); - task_payload.meshlet_delta_indices[index] = uint8_t(ti); + task_payload.meshlet_delta_indices[ti] = uint8_t(ti); + uint count = min(draw_command.num_meshlets - gl_WorkGroupID.x * gl_WorkGroupSize.x, 
gl_WorkGroupSize.x); + EmitMeshTasksEXT(count, 1, 1); } - barrier(); - EmitMeshTasksEXT(shared_count, 1, 1); - -#else - task_payload.meshlet_delta_indices[ti] = uint8_t(ti); - uint count = min(draw_command.num_meshlets - gl_WorkGroupID.x * gl_WorkGroupSize.x, gl_WorkGroupSize.x); - EmitMeshTasksEXT(count, 1, 1); -#endif } diff --git a/src/PBRDeferred.cpp b/src/PBRDeferred.cpp index aef35f7b..aa3e1d00 100644 --- a/src/PBRDeferred.cpp +++ b/src/PBRDeferred.cpp @@ -728,6 +728,9 @@ vierkant::Framebuffer &PBRDeferred::geometry_pass(cull_result_t &cull_result) vierkant::pipeline_specialization pipeline_specialization; pipeline_specialization.set(0, mesh_shader_props.maxPreferredTaskWorkGroupInvocations); pipeline_specialization.set(1, mesh_shader_props.maxPreferredMeshWorkGroupInvocations); + + //layout (constant_id = 2) const bool use_culling + pipeline_specialization.set(2, VkBool32(use_gpu_culling)); drawable.pipeline_format.specialization = std::move(pipeline_specialization); auto &desc_depth_pyramid = drawable.descriptors[Rasterizer::BINDING_DEPTH_PYRAMID]; @@ -771,8 +774,8 @@ vierkant::Framebuffer &PBRDeferred::geometry_pass(cull_result_t &cull_result) if(drawable.descriptors.contains(Rasterizer::BINDING_DEPTH_PYRAMID) && drawable.pipeline_format.specialization) { - //layout (constant_id = 2) const bool post_pass - drawable.pipeline_format.specialization->set(2, VK_TRUE); + //layout (constant_id = 3) const bool post_pass + drawable.pipeline_format.specialization->set(3, VK_TRUE); } m_g_renderer_post.stage_drawable(drawable); } From be0b38a4130d202582149c9739e20131b0d9847d Mon Sep 17 00:00:00 2001 From: crocdialer Date: Thu, 24 Oct 2024 13:39:18 +0200 Subject: [PATCH 21/23] small hashmap excourse, cough up non-atomic flavour --- include/vierkant/hash.hpp | 10 +- include/vierkant/linear_hashmap.hpp | 196 +++++++++++++++++++++++++++- src/PBRDeferred.cpp | 1 + src/object_overlay.cpp | 2 +- tests/TestLinearHashmap.cpp | 86 ++++++++---- 5 files changed, 261 
insertions(+), 34 deletions(-) diff --git a/include/vierkant/hash.hpp b/include/vierkant/hash.hpp index 0032eae6..9c4d8f92 100644 --- a/include/vierkant/hash.hpp +++ b/include/vierkant/hash.hpp @@ -4,9 +4,9 @@ #pragma once -#include -#include #include +#include +#include namespace vierkant { @@ -90,11 +90,11 @@ static inline uint32_t murmur3_32(const K &key, uint32_t seed) if constexpr(num_hashes) { - auto ptr = reinterpret_cast(&key); + auto ptr = reinterpret_cast(&key), end = ptr + num_hashes; - for(uint32_t i = num_hashes; i; i--) + for(; ptr < end; ++ptr) { - h ^= murmur_32_scramble(ptr[i - 1]); + h ^= murmur_32_scramble(*ptr); h = (h << 13) | (h >> 19); h = h * 5 + 0xe6546b64; } diff --git a/include/vierkant/linear_hashmap.hpp b/include/vierkant/linear_hashmap.hpp index 040e11ad..4453f10f 100644 --- a/include/vierkant/linear_hashmap.hpp +++ b/include/vierkant/linear_hashmap.hpp @@ -41,6 +41,198 @@ class linear_hashmap clear(); } + [[nodiscard]] inline size_t size() const { return m_num_elements; } + + [[nodiscard]] inline size_t capacity() const { return m_capacity; } + + [[nodiscard]] inline bool empty() const { return size() == 0; } + + inline void clear() + { + m_num_elements = 0; + storage_item_t *ptr = m_storage.get(), *end = ptr + m_capacity; + for(; ptr != end; ++ptr) + { + ptr->key = key_t(); + ptr->value = std::optional(); + } + } + + inline uint32_t put(const key_t &key, const value_t &value) + { + check_load_factor(); + return internal_put(key, value); + } + + [[nodiscard]] std::optional get(const key_t &key) const + { + if(!m_capacity) { return {}; } + + for(uint32_t idx = m_hash_fn(key);; idx++) + { + idx &= m_capacity - 1; + auto &item = m_storage[idx]; + if(item.key == key_t()) { return {}; } + else if(key == item.key) + { + if(item.value) { return item.value; } + } + } + } + + void remove(const key_t &key) + { + if(!m_capacity) { return; } + + for(uint32_t idx = m_hash_fn(key);; idx++) + { + idx &= m_capacity - 1; + auto &item = m_storage[idx]; 
+ if(item.key == key_t()) { return; } + else if(key == item.key && item.value) + { + item.value = {}; + m_num_elements--; + return; + } + } + } + + [[nodiscard]] inline bool contains(const key_t &key) const { return get(key) != std::nullopt; } + + size_t get_storage(void *dst) const + { + struct output_item_t + { + key_t key = {}; + value_t value = {}; + }; + + if(dst) + { + auto output_ptr = reinterpret_cast(dst); + storage_item_t *item = m_storage.get(), *end = item + m_capacity; + for(; item != end; ++item, ++output_ptr) + { + if(item->key != key_t()) + { + output_ptr->key = item->key; + output_ptr->value = item->value ? *item->value : value_t(); + } + else { *output_ptr = {}; } + } + } + return sizeof(output_item_t) * m_capacity; + } + + void reserve(size_t new_capacity) + { + auto new_linear_hashmap = linear_hashmap(new_capacity); + storage_item_t *ptr = m_storage.get(), *end = ptr + m_capacity; + for(; ptr != end; ++ptr) + { + if(ptr->key != key_t()) + { + if(ptr->value) { new_linear_hashmap.put(ptr->key, *ptr->value); } + } + } + swap(*this, new_linear_hashmap); + } + + [[nodiscard]] float load_factor() const { return static_cast(m_num_elements) / m_capacity; } + + [[nodiscard]] float max_load_factor() const { return m_max_load_factor; } + + void max_load_factor(float load_factor) + { + m_max_load_factor = std::clamp(load_factor, 0.01f, 1.f); + check_load_factor(); + } + + friend void swap(linear_hashmap &lhs, linear_hashmap &rhs) + { + std::swap(lhs.m_capacity, rhs.m_capacity); + std::swap(lhs.m_num_elements, rhs.m_num_elements); + std::swap(lhs.m_storage, rhs.m_storage); + std::swap(lhs.m_hash_fn, rhs.m_hash_fn); + std::swap(lhs.m_max_load_factor, rhs.m_max_load_factor); + std::swap(lhs.m_grow_factor, rhs.m_grow_factor); + } + +private: + struct storage_item_t + { + key_t key; + std::optional value; + }; + + inline void check_load_factor() + { + if(m_num_elements >= m_capacity * m_max_load_factor) + { + reserve(std::max(32, static_cast(m_grow_factor * 
m_capacity))); + } + } + + inline uint32_t internal_put(const key_t key, const value_t &value) + { + uint32_t probe_length = 0; + + for(uint64_t idx = m_hash_fn(key);; idx++, probe_length++) + { + idx &= m_capacity - 1; + auto &item = m_storage[idx]; + + // load previous key + key_t probed_key = item.key; + + if(probed_key != key) + { + // hit another valid entry, keep probing + if(probed_key != key_t() && item.value) { continue; } + item.key = key; + m_num_elements++; + } + item.value = value; + return probe_length; + } + } + + uint64_t m_capacity = 0; + uint64_t m_num_elements = 0; + std::unique_ptr m_storage; + hash32_fn m_hash_fn = std::bind(murmur3_32, std::placeholders::_1, 0); + + // reasonably low load-factor to keep average probe-lengths low + float m_max_load_factor = 0.5f; + float m_grow_factor = 2.f; +}; + +template +class linear_hashmap_mt +{ +public: + using key_t = K; + using value_t = V; + using hash32_fn = std::function; + static_assert(std::is_default_constructible_v, "key_t not default-constructible"); + static_assert(std::equality_comparable, "key_t not comparable"); + + linear_hashmap_mt() = default; + linear_hashmap_mt(const linear_hashmap_mt &) = delete; + linear_hashmap_mt(linear_hashmap_mt &other) : linear_hashmap_mt() { swap(*this, other); }; + linear_hashmap_mt &operator=(linear_hashmap_mt other) + { + swap(*this, other); + return *this; + } + + explicit linear_hashmap_mt(uint64_t min_capacity) + : m_capacity(crocore::next_pow_2(min_capacity)), m_storage(std::make_unique(m_capacity)) + { + clear(); + } + inline size_t size() const { return m_num_elements; } inline size_t capacity() const { return m_capacity; } @@ -133,7 +325,7 @@ class linear_hashmap void reserve(size_t new_capacity) { - auto new_linear_hashmap = linear_hashmap(new_capacity); + auto new_linear_hashmap = linear_hashmap_mt(new_capacity); storage_item_t *ptr = m_storage.get(), *end = ptr + m_capacity; for(; ptr != end; ++ptr) { @@ -155,7 +347,7 @@ class linear_hashmap 
check_load_factor(); } - friend void swap(linear_hashmap &lhs, linear_hashmap &rhs) + friend void swap(linear_hashmap_mt &lhs, linear_hashmap_mt &rhs) { std::lock(lhs.m_mutex, rhs.m_mutex); std::unique_lock lock_lhs(lhs.m_mutex, std::adopt_lock), lock_rhs(rhs.m_mutex, std::adopt_lock); diff --git a/src/PBRDeferred.cpp b/src/PBRDeferred.cpp index aa3e1d00..4ee9d8a9 100644 --- a/src/PBRDeferred.cpp +++ b/src/PBRDeferred.cpp @@ -879,6 +879,7 @@ vierkant::Framebuffer &PBRDeferred::geometry_pass(cull_result_t &cull_result) g_buffer_semaphore_submit_info_pre.signal_value = frame_context.current_semaphore_value + (use_gpu_culling ? SemaphoreValue::G_BUFFER_LAST_VISIBLE : SemaphoreValue::G_BUFFER_ALL); + g_buffer_semaphore_submit_info_pre.signal_stage = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT; frame_context.g_buffer_main.submit({cmd_buffer_pre}, m_queue, {g_buffer_semaphore_submit_info_pre}); if(use_gpu_culling) diff --git a/src/object_overlay.cpp b/src/object_overlay.cpp index 628f2bff..27d973b7 100644 --- a/src/object_overlay.cpp +++ b/src/object_overlay.cpp @@ -9,7 +9,7 @@ namespace vierkant struct object_overlay_context_t { - vierkant::linear_hashmap id_map; + vierkant::linear_hashmap_mt id_map; vierkant::BufferPtr id_map_storage_buffer; vierkant::BufferPtr param_buffer; vierkant::BufferPtr staging_buffer; diff --git a/tests/TestLinearHashmap.cpp b/tests/TestLinearHashmap.cpp index 605b9bfe..4465b440 100644 --- a/tests/TestLinearHashmap.cpp +++ b/tests/TestLinearHashmap.cpp @@ -1,20 +1,21 @@ #include #include - -TEST(linear_hashmap, empty) +template class hashmap_t> +void test_empty() { - vierkant::linear_hashmap hashmap; + hashmap_t hashmap; EXPECT_TRUE(hashmap.empty()); hashmap.clear(); EXPECT_EQ(hashmap.capacity(), 0); EXPECT_EQ(hashmap.get_storage(nullptr), 0); -} +}; -TEST(linear_hashmap, basic) +template class hashmap_t> +void test_basic() { constexpr uint32_t test_capacity = 100; - vierkant::linear_hashmap hashmap(test_capacity); + hashmap_t 
hashmap(test_capacity); EXPECT_TRUE(hashmap.empty()); EXPECT_GT(hashmap.get_storage(nullptr), 0); @@ -44,7 +45,8 @@ TEST(linear_hashmap, basic) hashmap.get_storage(storage.get()); } -TEST(linear_hashmap, custom_key) +template class hashmap_t> +void test_custom_key() { // custom 32-byte key struct custom_key_t @@ -60,7 +62,7 @@ TEST(linear_hashmap, custom_key) } }; constexpr uint32_t test_capacity = 100; - auto hashmap = vierkant::linear_hashmap(test_capacity); + auto hashmap = hashmap_t(test_capacity); custom_key_t k1{{1, 2, 3, 4, 5, 6, 7, 8}}; hashmap.put(k1, 69); @@ -68,25 +70,10 @@ TEST(linear_hashmap, custom_key) EXPECT_FALSE(hashmap.contains(custom_key_t())); } -TEST(linear_hashmap, reserve) -{ - vierkant::linear_hashmap hashmap; - - // fix by resizing - hashmap.reserve(17); - EXPECT_TRUE(hashmap.empty()); - hashmap.put(13, 12); - EXPECT_TRUE(hashmap.contains(13)); - - // empty / no capacity specified -> triggers internal resize - hashmap = {}; - hashmap.put(13, 12); - EXPECT_TRUE(hashmap.contains(13)); -} - -TEST(linear_hashmap, probe_length) +template class hashmap_t> +void test_probe_length() { - vierkant::linear_hashmap hashmap; + hashmap_t hashmap; // default load_factor is 0.5 EXPECT_EQ(hashmap.max_load_factor(), 0.5f); @@ -107,4 +94,51 @@ TEST(linear_hashmap, probe_length) EXPECT_LE(avg_probe_length, expected_max_avg_probe_length); EXPECT_LE(hashmap.load_factor(), 0.25f); +} + +TEST(linear_hashmap, empty) +{ + test_empty(); + test_empty(); +} + +TEST(linear_hashmap, basic) +{ + test_basic(); + test_basic(); +} + +TEST(linear_hashmap, custom_key) +{ + test_custom_key(); + test_custom_key(); +} + +template class hashmap_t> +void test_reserve() +{ + hashmap_t hashmap; + + // fix by resizing + hashmap.reserve(17); + EXPECT_TRUE(hashmap.empty()); + hashmap.put(13, 12); + EXPECT_TRUE(hashmap.contains(13)); + + // empty / no capacity specified -> triggers internal resize + hashmap = {}; + hashmap.put(13, 12); + EXPECT_TRUE(hashmap.contains(13)); +} + 
+TEST(linear_hashmap, reserve) +{ + test_reserve(); + test_reserve(); +} + +TEST(linear_hashmap, probe_length) +{ + test_probe_length(); + test_probe_length(); } \ No newline at end of file From 74e854b63cd65c73cac9ced090d71647a9f13a6e Mon Sep 17 00:00:00 2001 From: crocdialer Date: Thu, 24 Oct 2024 14:43:33 +0200 Subject: [PATCH 22/23] meshlets: fix annoying startup flicker-issue by pre-creating HZB --- include/vierkant/gpu_culling.hpp | 2 ++ src/PBRDeferred.cpp | 5 ++++- src/gpu_culling.cpp | 22 +++++++++++++++++++++- 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/include/vierkant/gpu_culling.hpp b/include/vierkant/gpu_culling.hpp index 47f142b5..b710d1b3 100644 --- a/include/vierkant/gpu_culling.hpp +++ b/include/vierkant/gpu_culling.hpp @@ -79,10 +79,12 @@ struct create_depth_pyramid_params_t * @brief create_gpu_cull_context is a factory to create an opaque gpu_cull_context_ptr. * * @param device a provided vierkant::Device. + * @param size context framebuffer-size * @param pipeline_cache an optional pipeline_cache. * @return an opaque pointer, owning a gpu_cull_context. 
*/ gpu_cull_context_ptr create_gpu_cull_context(const vierkant::DevicePtr &device, + const glm::vec2 &size, const vierkant::PipelineCachePtr &pipeline_cache = nullptr); /** diff --git a/src/PBRDeferred.cpp b/src/PBRDeferred.cpp index 4ee9d8a9..26b0d8db 100644 --- a/src/PBRDeferred.cpp +++ b/src/PBRDeferred.cpp @@ -99,7 +99,6 @@ PBRDeferred::PBRDeferred(const DevicePtr &device, const create_info_t &create_in frame_context.query_pool = vierkant::create_query_pool(m_device, SemaphoreValue::MAX_VALUE * 2, VK_QUERY_TYPE_TIMESTAMP); - frame_context.gpu_cull_context = vierkant::create_gpu_cull_context(device, m_pipeline_cache); // create staging-buffers vierkant::Buffer::create_info_t staging_buffer_info = {}; @@ -1320,6 +1319,10 @@ void vierkant::PBRDeferred::resize_storage(vierkant::PBRDeferred::frame_context_ depth_fmt.usage = VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT; frame_context.depth_map = vierkant::Image::create(m_device, depth_fmt); + // culling context, containing depth-pyramid / HZB + frame_context.gpu_cull_context = + vierkant::create_gpu_cull_context(m_device, {size.width, size.height}, m_pipeline_cache); + // init lighting framebuffer vierkant::attachment_map_t lighting_attachments; vierkant::Image::Format hdr_attachment_info = {}; diff --git a/src/gpu_culling.cpp b/src/gpu_culling.cpp index 790dcbfa..0a74fc52 100644 --- a/src/gpu_culling.cpp +++ b/src/gpu_culling.cpp @@ -312,7 +312,8 @@ draw_cull_result_t gpu_cull(const vierkant::gpu_cull_context_ptr &context, const return *reinterpret_cast(context->draw_cull_result_buffer_host->map()); } -gpu_cull_context_ptr create_gpu_cull_context(const DevicePtr &device, const vierkant::PipelineCachePtr &pipeline_cache) +gpu_cull_context_ptr create_gpu_cull_context(const DevicePtr &device, const glm::vec2 &size, + const vierkant::PipelineCachePtr &pipeline_cache) { auto ret = gpu_cull_context_ptr(new gpu_cull_context_t, std::default_delete()); ret->device = device; @@ -378,6 +379,25 @@ 
gpu_cull_context_ptr create_gpu_cull_context(const DevicePtr &device, const vier buffer_info.mem_usage = VMA_MEMORY_USAGE_CPU_TO_GPU; buffer_info.name = "depth_pyramid_ubo"; ret->depth_pyramid_ubo = vierkant::Buffer::create(buffer_info); + + { + VkExtent3D extent_pyramid_lvl0 = {static_cast(size.x), static_cast(size.y), 1}; + extent_pyramid_lvl0.width = crocore::next_pow_2(1 + extent_pyramid_lvl0.width / 2); + extent_pyramid_lvl0.height = crocore::next_pow_2(1 + extent_pyramid_lvl0.height / 2); + + vierkant::Image::Format depth_pyramid_fmt = {}; + depth_pyramid_fmt.extent = extent_pyramid_lvl0; + depth_pyramid_fmt.format = VK_FORMAT_R32_SFLOAT; + depth_pyramid_fmt.aspect = VK_IMAGE_ASPECT_COLOR_BIT; + depth_pyramid_fmt.usage = VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_SAMPLED_BIT; + depth_pyramid_fmt.use_mipmap = true; + depth_pyramid_fmt.autogenerate_mipmaps = false; + depth_pyramid_fmt.reduction_mode = VK_SAMPLER_REDUCTION_MODE_MIN; + depth_pyramid_fmt.initial_layout = VK_IMAGE_LAYOUT_GENERAL; + // TODO: pass in cmd-buffer for layout-transition +// depth_pyramid_fmt.initial_layout_transition = false; + ret->depth_pyramid_img = vierkant::Image::create(device, depth_pyramid_fmt); + } return ret; } From ba2ba9ba108b90b11857bff9362c00ee92ca56f0 Mon Sep 17 00:00:00 2001 From: crocdialer Date: Fri, 25 Oct 2024 08:52:56 +0200 Subject: [PATCH 23/23] fixed last missing bit, late_visiblity unfucks the interaction with object&meshlet culling --- include/vierkant/Rasterizer.hpp | 2 +- shaders/pbr/cull_meshlets.task | 5 +++-- shaders/pbr/indirect_cull.comp | 1 + shaders/renderer/types.glsl | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/vierkant/Rasterizer.hpp b/include/vierkant/Rasterizer.hpp index 632b2eca..d6e61db0 100644 --- a/include/vierkant/Rasterizer.hpp +++ b/include/vierkant/Rasterizer.hpp @@ -84,13 +84,13 @@ class Rasterizer VkDrawMeshTasksIndirectCommandEXT vk_mesh_draw = {};// size: 3 uint32_t visible = false; + uint32_t 
late_visible = false; uint32_t object_index = 0; uint32_t base_meshlet = 0; uint32_t num_meshlets = 0; uint32_t meshlet_visibility_index = 0; uint32_t count_buffer_offset = 0; uint32_t first_draw_index = 0; - uint32_t pad[1]{}; }; struct indirect_draw_bundle_t diff --git a/shaders/pbr/cull_meshlets.task b/shaders/pbr/cull_meshlets.task index 881301ef..efb0a267 100644 --- a/shaders/pbr/cull_meshlets.task +++ b/shaders/pbr/cull_meshlets.task @@ -101,8 +101,7 @@ void main() if(use_culling) { bool visible = valid; - bool visible_last_frame = valid && is_visible(draw_command.meshlet_visibility_index, mi); - // barrier(); + bool visible_last_frame = !draw_command.late_visible && is_visible(draw_command.meshlet_visibility_index, mi); // occlusion-culling / visibility recording if(post_pass && valid) @@ -157,6 +156,8 @@ void main() // became visible this frame -> not drawn in 1st-pass bool needs_post_draw = visible && !visible_last_frame; + + // TODO: culling statistics // if(!visible){ atomicAdd(cull_data.draw_result.v.num_occlusion_culled, 1); } // update visiblity diff --git a/shaders/pbr/indirect_cull.comp b/shaders/pbr/indirect_cull.comp index 460a36a8..4277b56e 100644 --- a/shaders/pbr/indirect_cull.comp +++ b/shaders/pbr/indirect_cull.comp @@ -177,6 +177,7 @@ void main() if(needs_post_draw) { + draw.late_visible = true; uint draw_cmd_offset_post = atomicAdd(cull_data.draw_count_post.v[draw.count_buffer_offset], 1); cull_data.draws_out_post.v[draw.first_draw_index + draw_cmd_offset_post] = draw; } diff --git a/shaders/renderer/types.glsl b/shaders/renderer/types.glsl index 7094e495..3ebbafa3 100644 --- a/shaders/renderer/types.glsl +++ b/shaders/renderer/types.glsl @@ -152,13 +152,13 @@ struct indexed_indirect_command_t uint groupCountZ; bool visible; + bool late_visible; uint object_index; uint base_meshlet; uint num_meshlets; uint meshlet_visibility_index; uint count_buffer_offset; uint first_draw_index; - uint pad; }; //! meshlet parameters