diff --git a/examples/headless/src/main.rs b/examples/headless/src/main.rs index c5cb8b06b..98d00c3f8 100644 --- a/examples/headless/src/main.rs +++ b/examples/headless/src/main.rs @@ -139,6 +139,7 @@ async fn render(mut scenes: SceneSet, index: usize, args: &Args) -> Result<()> { width, height, antialiasing_method: vello::AaConfig::Area, + debug: vello::DebugLayers::none(), }; let mut scene = Scene::new(); scene.append(&fragment, Some(transform)); diff --git a/examples/simple/src/main.rs b/examples/simple/src/main.rs index 5e5c581ad..bbf721645 100644 --- a/examples/simple/src/main.rs +++ b/examples/simple/src/main.rs @@ -7,7 +7,7 @@ use std::sync::Arc; use vello::kurbo::{Affine, Circle, Ellipse, Line, RoundedRect, Stroke}; use vello::peniko::Color; use vello::util::{RenderContext, RenderSurface}; -use vello::{AaConfig, Renderer, RendererOptions, Scene}; +use vello::{AaConfig, DebugLayers, Renderer, RendererOptions, Scene}; use winit::application::ApplicationHandler; use winit::dpi::LogicalSize; use winit::event::*; @@ -151,6 +151,7 @@ impl<'s> ApplicationHandler for SimpleVelloApp<'s> { width, height, antialiasing_method: AaConfig::Msaa16, + debug: DebugLayers::none(), }, ) .expect("failed to render to surface"); diff --git a/examples/with_winit/Cargo.toml b/examples/with_winit/Cargo.toml index 1d508d4e8..783e233d2 100644 --- a/examples/with_winit/Cargo.toml +++ b/examples/with_winit/Cargo.toml @@ -25,8 +25,9 @@ workspace = true name = "with_winit_bin" path = "src/main.rs" + [dependencies] -vello = { workspace = true, features = ["buffer_labels"] } +vello = { workspace = true, features = ["buffer_labels", "debug_layers"] } scenes = { workspace = true } anyhow = { workspace = true } diff --git a/examples/with_winit/src/lib.rs b/examples/with_winit/src/lib.rs index 4c6bc97a3..0e8678149 100644 --- a/examples/with_winit/src/lib.rs +++ b/examples/with_winit/src/lib.rs @@ -162,6 +162,8 @@ struct VelloApp<'s> { prev_scene_ix: i32, modifiers: ModifiersState, + + debug: 
vello::DebugLayers, } impl<'s> ApplicationHandler for VelloApp<'s> { @@ -329,6 +331,27 @@ impl<'s> ApplicationHandler for VelloApp<'s> { }, ); } + debug_layer @ ("1" | "2" | "3" | "4") => { + match debug_layer { + "1" => { + self.debug.toggle(vello::DebugLayers::BOUNDING_BOXES); + } + "2" => { + self.debug + .toggle(vello::DebugLayers::LINESOUP_SEGMENTS); + } + "3" => { + self.debug.toggle(vello::DebugLayers::LINESOUP_POINTS); + } + "4" => { + self.debug.toggle(vello::DebugLayers::VALIDATION); + } + _ => unreachable!(), + } + if !self.debug.is_empty() && !self.async_pipeline { + log::warn!("Debug Layers won't work without using `--async-pipeline`. Requested {:?}", self.debug); + } + } _ => {} } } @@ -464,6 +487,7 @@ impl<'s> ApplicationHandler for VelloApp<'s> { width, height, antialiasing_method, + debug: self.debug, }; self.scene.reset(); let mut transform = self.transform; @@ -674,6 +698,8 @@ fn run( Some(render_state) }; + let debug = vello::DebugLayers::none(); + let mut app = VelloApp { context: render_cx, renderers, @@ -718,6 +744,7 @@ fn run( complexity: 0, prev_scene_ix: 0, modifiers: ModifiersState::default(), + debug, }; event_loop.run_app(&mut app).expect("run to completion"); @@ -786,6 +813,7 @@ pub fn main() -> anyhow::Result<()> { #[cfg(not(target_arch = "wasm32"))] env_logger::builder() .format_timestamp(Some(env_logger::TimestampPrecision::Millis)) + .filter_level(log::LevelFilter::Warn) .init(); let args = parse_arguments(); let scenes = args.args.select_scene_set()?; diff --git a/vello/Cargo.toml b/vello/Cargo.toml index 1fd55c6b3..83ba98d55 100644 --- a/vello/Cargo.toml +++ b/vello/Cargo.toml @@ -18,6 +18,7 @@ default = ["wgpu"] bump_estimate = ["vello_encoding/bump_estimate"] hot_reload = ["vello_shaders/compile"] buffer_labels = [] +debug_layers = [] wgpu = ["dep:wgpu"] wgpu-profiler = ["dep:wgpu-profiler"] diff --git a/vello/src/debug.rs b/vello/src/debug.rs new file mode 100644 index 000000000..5ddf89e1d --- /dev/null +++ 
b/vello/src/debug.rs @@ -0,0 +1,119 @@ +// Copyright 2023 the Vello Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +#[cfg(all(feature = "debug_layers", feature = "wgpu"))] +mod renderer; +#[cfg(all(feature = "debug_layers", feature = "wgpu"))] +mod validate; + +use std::fmt::Debug; + +#[cfg(all(feature = "debug_layers", feature = "wgpu"))] +pub(crate) use renderer::*; + +/// Bitflags for enabled debug operations. +/// +/// Currently, all layers additionally require the `debug_layers` feature. +#[derive(Copy, Clone)] +pub struct DebugLayers(u8); + +impl Debug for DebugLayers { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let mut tuple = f.debug_tuple("DebugLayers"); + if self.contains(Self::BOUNDING_BOXES) { + tuple.field(&"BOUNDING_BOXES"); + } + if self.contains(Self::LINESOUP_SEGMENTS) { + tuple.field(&"LINESOUP_SEGMENTS"); + } + if self.contains(Self::LINESOUP_POINTS) { + tuple.field(&"LINESOUP_POINTS"); + } + if self.contains(Self::VALIDATION) { + tuple.field(&"VALIDATION"); + } + + tuple.finish() + } +} + +// TODO: Currently all layers require read-back of the BumpAllocators buffer. This isn't strictly +// necessary for layers other than `VALIDATION`. The debug visualizations use the bump buffer only +// to obtain various instance counts for draws and these could instead get written out to an +// indirect draw buffer. OTOH `VALIDATION` should always require readback since we want to be able +// to run the same CPU-side tests for both CPU and GPU shaders. +impl DebugLayers { + /// Visualize the bounding box of every path. + /// Requires the `debug_layers` feature. + pub const BOUNDING_BOXES: DebugLayers = DebugLayers(1 << 0); + + /// Visualize the post-flattening line segments using line primitives. + /// Requires the `debug_layers` feature. + pub const LINESOUP_SEGMENTS: DebugLayers = DebugLayers(1 << 1); + + /// Visualize the post-flattening line endpoints. + /// Requires the `debug_layers` feature. 
+ pub const LINESOUP_POINTS: DebugLayers = DebugLayers(1 << 2); + + /// Enable validation of internal buffer contents and visualize errors. Validation tests are + /// run on the CPU and require buffer contents to be read-back. + /// + /// Supported validation tests: + /// + /// - Watertightness: validate that every line segment within a path is connected without + /// any gaps. Line endpoints that don't precisely overlap another endpoint get visualized + /// as red circles and logged to stderr. + /// + /// Requires the `debug_layers` feature. + pub const VALIDATION: DebugLayers = DebugLayers(1 << 3); + + /// Construct a `DebugLayers` from the raw bits. + pub const fn from_bits(bits: u8) -> Self { + Self(bits) + } + + /// Get the raw representation of this value. + pub const fn bits(self) -> u8 { + self.0 + } + + /// A `DebugLayers` with no layers enabled. + pub const fn none() -> Self { + Self(0) + } + + /// A `DebugLayers` with all layers enabled. + pub const fn all() -> Self { + // Custom BitOr is not const, so need to manipulate the inner value here + Self( + Self::BOUNDING_BOXES.0 + | Self::LINESOUP_SEGMENTS.0 + | Self::LINESOUP_POINTS.0 + | Self::VALIDATION.0, + ) + } + + /// True if this `DebugLayers` has no layers enabled. + pub const fn is_empty(self) -> bool { + self.0 == 0 + } + + /// Determine whether `self` is a superset of `mask`. + pub const fn contains(self, mask: DebugLayers) -> bool { + self.0 & mask.0 == mask.0 + } + + /// Toggle the value of the layers specified in mask. + pub fn toggle(&mut self, mask: DebugLayers) { + self.0 ^= mask.0; + } +} + +/// Returns the union of the two input `DebugLayers`. 
+impl std::ops::BitOr for DebugLayers { + type Output = Self; + + fn bitor(self, rhs: Self) -> Self { + Self(self.0 | rhs.0) + } +} diff --git a/vello/src/debug/renderer.rs b/vello/src/debug/renderer.rs new file mode 100644 index 000000000..534607894 --- /dev/null +++ b/vello/src/debug/renderer.rs @@ -0,0 +1,497 @@ +// Copyright 2023 the Vello Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +use super::DebugLayers; +use crate::{ + debug::validate::{validate_line_soup, LineEndpoint}, + recording::{BindType, DrawParams, ImageProxy, Recording, ResourceProxy, ShaderId}, + render::CapturedBuffers, + wgpu_engine::WgpuEngine, + DebugDownloads, RenderParams, +}; + +use { + bytemuck::{offset_of, Pod, Zeroable}, + peniko::Color, + vello_encoding::{BumpAllocators, LineSoup, PathBbox}, +}; +pub(crate) struct DebugRenderer { + // `clear_tint` slightly darkens the output from the vello renderer to make the debug overlays + // more distinguishable. + clear_tint: ShaderId, + bboxes: ShaderId, + linesoup: ShaderId, + linesoup_points: ShaderId, + unpaired_points: ShaderId, +} + +impl DebugRenderer { + pub fn new( + device: &wgpu::Device, + target_format: wgpu::TextureFormat, + engine: &mut WgpuEngine, + ) -> Self { + let module = device.create_shader_module(wgpu::ShaderModuleDescriptor { + label: Some("debug layers"), + source: wgpu::ShaderSource::Wgsl(SHADERS.into()), + }); + + let clear_tint = engine.add_render_shader( + device, + "clear-tint", + &module, + "full_screen_quad_vert", + "solid_color_frag", + wgpu::PrimitiveTopology::TriangleStrip, + wgpu::ColorTargetState { + format: target_format, + blend: Some(wgpu::BlendState { + color: wgpu::BlendComponent { + src_factor: wgpu::BlendFactor::SrcAlpha, + dst_factor: wgpu::BlendFactor::OneMinusSrcAlpha, + operation: wgpu::BlendOperation::Add, + }, + alpha: wgpu::BlendComponent::OVER, + }), + write_mask: wgpu::ColorWrites::ALL, + }, + None, + &[], + ); + let bboxes = engine.add_render_shader( + device, + "bbox-debug", + 
&module, + "bbox_vert", + "solid_color_frag", + wgpu::PrimitiveTopology::LineStrip, + wgpu::ColorTargetState { + format: target_format, + blend: None, + write_mask: wgpu::ColorWrites::ALL, + }, + // This mirrors the layout of the PathBbox structure (one instance per bounding box). + Some(wgpu::VertexBufferLayout { + array_stride: std::mem::size_of::<PathBbox>() as u64, + step_mode: wgpu::VertexStepMode::Instance, + attributes: &[ + wgpu::VertexAttribute { + format: wgpu::VertexFormat::Sint32x2, + offset: offset_of!(PathBbox, x0) as u64, + shader_location: 0, + }, + wgpu::VertexAttribute { + format: wgpu::VertexFormat::Sint32x2, + offset: offset_of!(PathBbox, x1) as u64, + shader_location: 1, + }, + ], + }), + &[(BindType::Uniform, wgpu::ShaderStages::VERTEX)], + ); + let linesoup = engine.add_render_shader( + device, + "linesoup-debug", + &module, + "linesoup_vert", + "solid_color_frag", + wgpu::PrimitiveTopology::TriangleStrip, + wgpu::ColorTargetState { + format: target_format, + blend: None, + write_mask: wgpu::ColorWrites::ALL, + }, + // This mirrors the layout of the LineSoup structure.
+ Some(wgpu::VertexBufferLayout { + array_stride: std::mem::size_of::<LineSoup>() as u64, + step_mode: wgpu::VertexStepMode::Instance, + attributes: &[ + wgpu::VertexAttribute { + format: wgpu::VertexFormat::Float32x2, + offset: offset_of!(LineSoup, p0) as u64, + shader_location: 0, + }, + wgpu::VertexAttribute { + format: wgpu::VertexFormat::Float32x2, + offset: offset_of!(LineSoup, p1) as u64, + shader_location: 1, + }, + ], + }), + &[(BindType::Uniform, wgpu::ShaderStages::VERTEX)], + ); + let linesoup_points = engine.add_render_shader( + device, + "linepoints-debug", + &module, + "linepoints_vert", + "sdf_circle_frag", + wgpu::PrimitiveTopology::TriangleStrip, + wgpu::ColorTargetState { + format: target_format, + blend: Some(wgpu::BlendState { + color: wgpu::BlendComponent { + src_factor: wgpu::BlendFactor::SrcAlpha, + dst_factor: wgpu::BlendFactor::OneMinusSrcAlpha, + operation: wgpu::BlendOperation::Add, + }, + alpha: wgpu::BlendComponent::OVER, + }), + write_mask: wgpu::ColorWrites::ALL, + }, + // This mirrors the layout of the LineSoup structure. The pipeline only processes the + // first point of each line. Since all points should be paired, this is enough to + // render all points. Any unpaired points get drawn separately by the `unpaired_points` + // pipeline, so no point should get missed.
+ Some(wgpu::VertexBufferLayout { + array_stride: std::mem::size_of::<LineSoup>() as u64, + step_mode: wgpu::VertexStepMode::Instance, + attributes: &[wgpu::VertexAttribute { + format: wgpu::VertexFormat::Float32x2, + offset: offset_of!(LineSoup, p0) as u64, + shader_location: 0, + }], + }), + &[ + (BindType::Uniform, wgpu::ShaderStages::VERTEX), + ( + BindType::Uniform, + wgpu::ShaderStages::VERTEX | wgpu::ShaderStages::FRAGMENT, + ), + ], + ); + let unpaired_points = engine.add_render_shader( + device, + "linepoints-debug", + &module, + "linepoints_vert", + "sdf_circle_frag", + wgpu::PrimitiveTopology::TriangleStrip, + wgpu::ColorTargetState { + format: target_format, + blend: Some(wgpu::BlendState { + color: wgpu::BlendComponent { + src_factor: wgpu::BlendFactor::SrcAlpha, + dst_factor: wgpu::BlendFactor::OneMinusSrcAlpha, + operation: wgpu::BlendOperation::Add, + }, + alpha: wgpu::BlendComponent::OVER, + }), + write_mask: wgpu::ColorWrites::ALL, + }, + // This mirrors the layout of the LineEndpoint structure. + Some(wgpu::VertexBufferLayout { + array_stride: std::mem::size_of::<LineEndpoint>() as u64, + step_mode: wgpu::VertexStepMode::Instance, + attributes: &[wgpu::VertexAttribute { + format: wgpu::VertexFormat::Float32x2, + offset: offset_of!(LineEndpoint, x) as u64, + shader_location: 0, + }], + }), + &[ + (BindType::Uniform, wgpu::ShaderStages::VERTEX), + ( + BindType::Uniform, + wgpu::ShaderStages::VERTEX | wgpu::ShaderStages::FRAGMENT, + ), + ], + ); + + Self { + clear_tint, + bboxes, + linesoup, + linesoup_points, + unpaired_points, + } + } + + pub fn render( + &self, + recording: &mut Recording, + target: ImageProxy, + captured: &CapturedBuffers, + bump: &BumpAllocators, + params: &RenderParams, + downloads: &DebugDownloads, + ) { + if params.debug.is_empty() { + return; + } + + let (unpaired_pts_len, unpaired_pts_buf) = if params.debug.contains(DebugLayers::VALIDATION) + { + // TODO: have this write directly to a GPU buffer?
+ let unpaired_pts: Vec<LineEndpoint> = + validate_line_soup(bytemuck::cast_slice(&downloads.lines.get_mapped_range())); + if unpaired_pts.is_empty() { + (0, None) + } else { + ( + unpaired_pts.len(), + Some( + recording + .upload("unpaired points", bytemuck::cast_slice(&unpaired_pts[..])), + ), + ) + } + } else { + (0, None) + }; + + let uniforms = Uniforms { + width: params.width, + height: params.height, + }; + let uniforms_buf = ResourceProxy::Buffer( + recording.upload_uniform("uniforms", bytemuck::bytes_of(&uniforms)), + ); + + let linepoints_uniforms = [ + LinepointsUniforms::new(Color::DARK_CYAN, 10.), + LinepointsUniforms::new(Color::RED, 80.), + ]; + let linepoints_uniforms_buf = recording.upload_uniform( + "linepoints uniforms", + bytemuck::bytes_of(&linepoints_uniforms), + ); + + recording.draw(DrawParams { + shader_id: self.clear_tint, + instance_count: 1, + vertex_count: 4, + vertex_buffer: None, + resources: vec![], + target, + clear_color: None, + }); + if params.debug.contains(DebugLayers::BOUNDING_BOXES) { + recording.draw(DrawParams { + shader_id: self.bboxes, + instance_count: captured.sizes.path_bboxes.len(), + vertex_count: 5, + vertex_buffer: Some(captured.path_bboxes), + resources: vec![uniforms_buf], + target, + clear_color: None, + }); + } + if params.debug.contains(DebugLayers::LINESOUP_SEGMENTS) { + recording.draw(DrawParams { + shader_id: self.linesoup, + instance_count: bump.lines, + vertex_count: 4, + vertex_buffer: Some(captured.lines), + resources: vec![uniforms_buf], + target, + clear_color: None, + }); + } + if params.debug.contains(DebugLayers::LINESOUP_POINTS) { + recording.draw(DrawParams { + shader_id: self.linesoup_points, + instance_count: bump.lines, + vertex_count: 4, + vertex_buffer: Some(captured.lines), + resources: vec![ + uniforms_buf, + ResourceProxy::BufferRange { + proxy: linepoints_uniforms_buf, + offset: 0, + size: std::mem::size_of::<LinepointsUniforms>() as u64, + }, + ], + target, + clear_color: None, + }); + } + if let Some(unpaired_pts_buf)
= unpaired_pts_buf { + recording.draw(DrawParams { + shader_id: self.unpaired_points, + instance_count: unpaired_pts_len.try_into().unwrap(), + vertex_count: 4, + vertex_buffer: Some(unpaired_pts_buf), + resources: vec![ + uniforms_buf, + ResourceProxy::BufferRange { + proxy: linepoints_uniforms_buf, + offset: std::mem::size_of::<LinepointsUniforms>() as u64, + size: std::mem::size_of::<LinepointsUniforms>() as u64, + }, + ], + target, + clear_color: None, + }); + recording.free_buffer(unpaired_pts_buf); + } + + recording.free_resource(uniforms_buf); + recording.free_buffer(linepoints_uniforms_buf); + } +} + +#[derive(Copy, Clone, Zeroable, Pod)] +#[repr(C)] +struct Uniforms { + width: u32, + height: u32, +} + +#[derive(Copy, Clone, Zeroable, Pod)] +#[repr(C)] +struct LinepointsUniforms { + point_color: [f32; 3], + point_size: f32, + // Uniform parameters for individual SDF point draws are stored in a single buffer. + // This 240 byte padding brings each element's size (and offset alignment) up to 256 bytes. + // (see https://www.w3.org/TR/webgpu/#dom-supported-limits-minuniformbufferoffsetalignment) + _pad0: [u32; 30], + _pad1: [u32; 30], +} + +impl LinepointsUniforms { + fn new(color: Color, point_size: f32) -> Self { + Self { + point_color: [ + color.r as f32 / 255., + color.g as f32 / 255., + color.b as f32 / 255., + ], + point_size, + _pad0: [0; 30], + _pad1: [0; 30], + } + } +} + +const SHADERS: &str = r#" + +// Map from y-down normalized coordinates to NDC: +fn map_to_ndc(p: vec2f) -> vec4f { + return vec4(vec2(1., -1.) * (2.
* p - vec2(1.)), 0., 1.); +} + +alias QuadVertices = array<vec2f, 4>; +var<private> quad_vertices: QuadVertices = QuadVertices( + vec2(0., 1.), + vec2(0., 0.), + vec2(1., 0.), + vec2(1., 1.), +); + +var<private> quad_fill_indices: array<u32, 4> = array<u32, 4>(0u, 3u, 1u, 2u); + +struct Uniforms { + width: u32, + height: u32, +} +@binding(0) @group(0) var<uniform> uniforms: Uniforms; + +struct VSOut { + @builtin(position) pos: vec4f, + @location(0) color: vec4f, +} + +//////////// + +@vertex +fn full_screen_quad_vert(@builtin(vertex_index) vid: u32) -> VSOut { + let p = quad_vertices[quad_fill_indices[vid]]; + // TODO: Make the alpha configurable here. + // The clear tint is a full-screen layer above the entire image with this color. + return VSOut(map_to_ndc(p), vec4(0., 0., 0., 0.2)); +} + +//////////// + +struct BboxIn { + @location(0) p0: vec2i, + @location(1) p1: vec2i, +} + +@vertex +fn bbox_vert(@builtin(vertex_index) vid: u32, bbox: BboxIn) -> VSOut { + let ul = vec2f(f32(bbox.p0.x), f32(bbox.p0.y)); + let br = vec2f(f32(bbox.p1.x), f32(bbox.p1.y)); + let dim = br - ul; + let p = (ul + dim * quad_vertices[vid % 4u]) / vec2f(f32(uniforms.width), f32(uniforms.height)); + return VSOut(map_to_ndc(p), vec4(0., 1., 0., 1.)); +} + +//////////// + +struct LinesoupIn { + @location(0) p0: vec2f, + @location(1) p1: vec2f, +} + +const LINE_THICKNESS: f32 = 4.; +const WIND_DOWN_COLOR: vec3f = vec3(0., 1., 0.); +const WIND_UP_COLOR: vec3f = vec3(1., 0., 0.); + +@vertex +fn linesoup_vert(@builtin(vertex_index) vid: u32, line: LinesoupIn) -> VSOut { + let quad_corner = quad_vertices[quad_fill_indices[vid]] - vec2(0.5); + let v = line.p1 - line.p0; + let m = mix(line.p0, line.p1, 0.5); + let s = vec2(LINE_THICKNESS, length(v)); + let vn = normalize(v); + let r = mat2x2(vn.y, -vn.x, vn.x, vn.y); + let p = (m + r * (s * quad_corner)) / vec2f(f32(uniforms.width), f32(uniforms.height)); + //let color = vec4(0.7, 0.5, 0., 1.); + let color = vec4(select(WIND_UP_COLOR, WIND_DOWN_COLOR, v.y >= 0.), 1.); + return VSOut(map_to_ndc(p),
color); +} + +//////////// + +struct LinepointsUniforms { + point_color: vec3f, + point_size: f32, +} +@binding(1) @group(0) var<uniform> linepoints_uniforms: LinepointsUniforms; + +struct SDFCircleOut { + @builtin(position) pos: vec4f, + + // Unpremultiplied color of the circle. + @location(0) color: vec3f, + + // The 2D position of the pixel fragment relative to the center of the quad. The quad edges + // are at coordinates (±1, 0) and (0, ±1). + @location(1) quad_relative: vec2f, +} + +@vertex +fn linepoints_vert(@builtin(vertex_index) vid: u32, @location(0) point: vec2f) -> SDFCircleOut { + let quad_corner = quad_vertices[quad_fill_indices[vid]] - vec2(0.5); + let rect_dim = vec2(linepoints_uniforms.point_size); + let p = (point + rect_dim * quad_corner) / vec2(f32(uniforms.width), f32(uniforms.height)); + + return SDFCircleOut( + map_to_ndc(p), + linepoints_uniforms.point_color, + // Normalize the corners of the quad such that they form a vector of length √2. This should + // align the edge fragments to ±1. The post-interpolation values of `quad_relative` will + // then form a distance field that can represent a circle of radius 1 within the quad + // (where the distance is relative to the center of the circle). + normalize(quad_corner) * sqrt(2.), + ); +} + +@fragment +fn solid_color_frag(in: VSOut) -> @location(0) vec4f { + return in.color; +} + +@fragment +fn sdf_circle_frag(in: SDFCircleOut) -> @location(0) vec4f { + // Draw an antialiased circle with a fading margin as a visual effect. `THRESHOLD` is the + // distance from the center of the circle to the edge where the fade begins. + let THRESHOLD = 0.6; + let d = saturate(length(in.quad_relative)); + let alpha = select(1., 1.
- smoothstep(THRESHOLD, 1., d), d > THRESHOLD); + return vec4(in.color.rgb, alpha); +} +"#; diff --git a/vello/src/debug/validate.rs b/vello/src/debug/validate.rs new file mode 100644 index 000000000..d3bd6e5d3 --- /dev/null +++ b/vello/src/debug/validate.rs @@ -0,0 +1,64 @@ +// Copyright 2023 the Vello Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +use { + bytemuck::{Pod, Zeroable}, + std::{collections::BTreeSet, fmt}, + vello_encoding::LineSoup, +}; + +#[derive(Copy, Clone, Default, PartialEq, Eq, PartialOrd, Ord, Pod, Zeroable)] +#[repr(C)] +pub struct LineEndpoint { + pub path_ix: u32, + + // Coordinates stored as the bit patterns of IEEE-754 32-bit floats. + // We use u32 here because we compare bit patterns rather than proximity, to evaluate exact watertightness. + // To accelerate this, we use a BTreeSet, which doesn't support f32 values directly. + pub x: u32, + pub y: u32, +} + +impl LineEndpoint { + pub fn new(line: &LineSoup, start_or_end: bool) -> Self { + let (x, y) = if start_or_end { + (line.p0[0], line.p0[1]) + } else { + (line.p1[0], line.p1[1]) + }; + Self { + path_ix: line.path_ix, + x: x.to_bits(), + y: y.to_bits(), + } + } +} + +impl fmt::Debug for LineEndpoint { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Point") + .field("path_ix", &self.path_ix) + .field("x", &f32::from_bits(self.x)) + .field("y", &f32::from_bits(self.y)) + .finish() + } +} + +pub(crate) fn validate_line_soup(lines: &[LineSoup]) -> Vec<LineEndpoint> { + let mut points = BTreeSet::new(); + for line in lines { + let pts = [ + LineEndpoint::new(line, true), + LineEndpoint::new(line, false), + ]; + for p in pts { + if !points.remove(&p) { + points.insert(p); + } + } + } + if !points.is_empty() { + eprintln!("Unpaired points are present: {:#?}", points); + } + points.into_iter().collect() +} diff --git a/vello/src/lib.rs b/vello/src/lib.rs index a8e22f53d..4aa072381 100644 --- a/vello/src/lib.rs +++ b/vello/src/lib.rs @@ -81,6 +81,7 @@ //! //!
See the [`examples/`](https://github.com/linebender/vello/tree/main/examples) folder to see how that code integrates with frameworks like winit. +mod debug; mod recording; mod render; mod scene; @@ -124,10 +125,11 @@ use vello_encoding::Resolver; #[cfg(feature = "wgpu")] use wgpu_engine::{ExternalResource, WgpuEngine}; +pub use debug::DebugLayers; /// Temporary export, used in `with_winit` for stats pub use vello_encoding::BumpAllocators; #[cfg(feature = "wgpu")] -use wgpu::{Device, PipelineCompilationOptions, Queue, SurfaceTexture, TextureFormat, TextureView}; +use wgpu::{Device, Queue, SurfaceTexture, TextureFormat, TextureView}; #[cfg(all(feature = "wgpu", feature = "wgpu-profiler"))] use wgpu_profiler::{GpuProfiler, GpuProfilerSettings}; @@ -212,6 +214,11 @@ pub enum Error { #[cfg(feature = "wgpu")] #[error("Failed to async map a buffer")] BufferAsyncError(#[from] wgpu::BufferAsyncError), + /// Failed to download an internal buffer for debug visualization. + #[cfg(feature = "wgpu")] + #[cfg(feature = "debug_layers")] + #[error("Failed to download internal buffer '{0}' for visualization")] + DownloadError(&'static str), #[cfg(feature = "wgpu")] #[error("wgpu Error from scope")] @@ -241,6 +248,8 @@ pub struct Renderer { resolver: Resolver, shaders: FullShaders, blit: Option, + #[cfg(feature = "debug_layers")] + debug: Option, target: Option, #[cfg(feature = "wgpu-profiler")] pub profiler: GpuProfiler, @@ -268,6 +277,13 @@ pub struct RenderParams { /// The anti-aliasing algorithm. The selected algorithm must have been initialized while /// constructing the `Renderer`. pub antialiasing_method: AaConfig, + + /// Options for debug layer rendering. + /// + /// This only has an effect when the `debug_layers` feature is enabled. + // This is exposed publicly as a least-effort to avoid changing the API when features change. + // We expect the API to change here in the near future. 
+ pub debug: DebugLayers, } #[cfg(feature = "wgpu")] @@ -296,6 +312,13 @@ pub struct RendererOptions { pub num_init_threads: Option, } +#[cfg(feature = "wgpu")] +struct RenderResult { + bump: Option, + #[cfg(feature = "debug_layers")] + captured: Option, +} + #[cfg(feature = "wgpu")] impl Renderer { /// Creates a new renderer for the specified device. @@ -311,7 +334,11 @@ impl Renderer { engine.build_shaders_if_needed(device, options.num_init_threads); let blit = options .surface_format - .map(|surface_format| BlitPipeline::new(device, surface_format)); + .map(|surface_format| BlitPipeline::new(device, surface_format, &mut engine)); + #[cfg(feature = "debug_layers")] + let debug = options + .surface_format + .map(|surface_format| debug::DebugRenderer::new(device, surface_format, &mut engine)); Ok(Self { options, @@ -319,6 +346,8 @@ impl Renderer { resolver: Resolver::new(), shaders, blit, + #[cfg(feature = "debug_layers")] + debug, target: None, // Use 3 pending frames #[cfg(feature = "wgpu-profiler")] @@ -407,45 +436,39 @@ impl Renderer { .blit .as_ref() .expect("renderer should have configured surface_format to use on a surface"); - let mut encoder = - device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None }); - { - let surface_view = surface - .texture - .create_view(&wgpu::TextureViewDescriptor::default()); - let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor { - label: None, - layout: &blit.bind_layout, - entries: &[wgpu::BindGroupEntry { - binding: 0, - resource: wgpu::BindingResource::TextureView(&target.view), - }], - }); - let mut render_pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor { - label: None, - color_attachments: &[Some(wgpu::RenderPassColorAttachment { - view: &surface_view, - resolve_target: None, - ops: wgpu::Operations { - load: wgpu::LoadOp::Clear(wgpu::Color::default()), - store: wgpu::StoreOp::Store, - }, - })], - depth_stencil_attachment: None, - occlusion_query_set: None, - 
timestamp_writes: None, - }); + let mut recording = Recording::default(); + let target_proxy = ImageProxy::new(width, height, ImageFormat::from_wgpu(target.format)); + let surface_proxy = ImageProxy::new( + width, + height, + ImageFormat::from_wgpu(surface.texture.format()), + ); + recording.draw(recording::DrawParams { + shader_id: blit.0, + instance_count: 1, + vertex_count: 6, + vertex_buffer: None, + resources: vec![ResourceProxy::Image(target_proxy)], + target: surface_proxy, + clear_color: Some([0., 0., 0., 0.]), + }); + + let surface_view = surface + .texture + .create_view(&wgpu::TextureViewDescriptor::default()); + let external_resources = [ + ExternalResource::Image(target_proxy, &target.view), + ExternalResource::Image(surface_proxy, &surface_view), + ]; + self.engine.run_recording( + device, + queue, + &recording, + &external_resources, + "blit (render_to_surface)", #[cfg(feature = "wgpu-profiler")] - let mut render_pass = self - .profiler - .scope("blit to surface", &mut render_pass, device); - render_pass.set_pipeline(&blit.pipeline); - render_pass.set_bind_group(0, &bind_group, &[]); - render_pass.draw(0..6, 0..1); - } - #[cfg(feature = "wgpu-profiler")] - self.profiler.resolve_queries(&mut encoder); - queue.submit(Some(encoder.finish())); + &mut self.profiler, + )?; self.target = Some(target); #[cfg(feature = "wgpu-profiler")] { @@ -467,12 +490,26 @@ impl Renderer { let mut engine = WgpuEngine::new(self.options.use_cpu); // We choose not to initialise these shaders in parallel, to ensure the error scope works correctly let shaders = shaders::full_shaders(device, &mut engine, &self.options)?; + let blit = self + .options + .surface_format + .map(|surface_format| BlitPipeline::new(device, surface_format, &mut engine)); + #[cfg(feature = "debug_layers")] + let debug = self + .options + .surface_format + .map(|format| debug::DebugRenderer::new(device, format, &mut engine)); let error = device.pop_error_scope().await; if let Some(error) = error { return 
Err(error.into()); } self.engine = engine; self.shaders = shaders; + self.blit = blit; + #[cfg(feature = "debug_layers")] + { + self.debug = debug; + } Ok(()) } @@ -495,10 +532,46 @@ impl Renderer { texture: &TextureView, params: &RenderParams, ) -> Result<Option<BumpAllocators>> { + let result = self + .render_to_texture_async_internal(device, queue, scene, texture, params) + .await?; + #[cfg(feature = "debug_layers")] + { + // TODO: it would be better to improve buffer ownership tracking so that it's not + // necessary to submit a whole new Recording to free the captured buffers. + if let Some(captured) = result.captured { + let mut recording = Recording::default(); + // TODO: this sucks. better to release everything in a helper + self.engine.free_download(captured.lines); + captured.release_buffers(&mut recording); + self.engine.run_recording( + device, + queue, + &recording, + &[], + "free memory", + #[cfg(feature = "wgpu-profiler")] + &mut self.profiler, + )?; + } + } + Ok(result.bump) + } + + async fn render_to_texture_async_internal( + &mut self, + device: &Device, + queue: &Queue, + scene: &Scene, + texture: &TextureView, + params: &RenderParams, + ) -> Result<RenderResult> { let mut render = Render::new(); let encoding = scene.encoding(); - // TODO: turn this on; the download feature interacts with CPU dispatch - let robust = false; + // TODO: turn this on; the download feature interacts with CPU dispatch. + // Currently this is always enabled when the `debug_layers` feature is enabled as the bump + // counts are used for debug visualization.
+ let robust = cfg!(feature = "debug_layers"); let recording = render.render_encoding_coarse( encoding, &mut self.resolver, @@ -508,6 +581,8 @@ impl Renderer { ); let target = render.out_image(); let bump_buf = render.bump_buf(); + #[cfg(feature = "debug_layers")] + let captured = render.take_captured_buffers(); self.engine.run_recording( device, queue, @@ -543,7 +618,11 @@ impl Renderer { #[cfg(feature = "wgpu-profiler")] &mut self.profiler, )?; - Ok(bump) + Ok(RenderResult { + bump, + #[cfg(feature = "debug_layers")] + captured, + }) } /// See [`Self::render_to_surface`] @@ -566,53 +645,73 @@ impl Renderer { if target.width != width || target.height != height { target = TargetTexture::new(device, width, height); } - let bump = self - .render_to_texture_async(device, queue, scene, &target.view, params) + let result = self + .render_to_texture_async_internal(device, queue, scene, &target.view, params) .await?; let blit = self .blit .as_ref() .expect("renderer should have configured surface_format to use on a surface"); - let mut encoder = - device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None }); + let mut recording = Recording::default(); + let target_proxy = ImageProxy::new(width, height, ImageFormat::from_wgpu(target.format)); + let surface_proxy = ImageProxy::new( + width, + height, + ImageFormat::from_wgpu(surface.texture.format()), + ); + recording.draw(recording::DrawParams { + shader_id: blit.0, + instance_count: 1, + vertex_count: 6, + vertex_buffer: None, + resources: vec![ResourceProxy::Image(target_proxy)], + target: surface_proxy, + clear_color: Some([0., 0., 0., 0.]), + }); + + #[cfg(feature = "debug_layers")] { - let surface_view = surface - .texture - .create_view(&wgpu::TextureViewDescriptor::default()); - let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor { - label: None, - layout: &blit.bind_layout, - entries: &[wgpu::BindGroupEntry { - binding: 0, - resource: 
wgpu::BindingResource::TextureView(&target.view), - }], - }); - let mut render_pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor { - label: None, - color_attachments: &[Some(wgpu::RenderPassColorAttachment { - view: &surface_view, - resolve_target: None, - ops: wgpu::Operations { - load: wgpu::LoadOp::Clear(wgpu::Color::default()), - store: wgpu::StoreOp::Store, - }, - })], - depth_stencil_attachment: None, - timestamp_writes: None, - occlusion_query_set: None, - }); - #[cfg(feature = "wgpu-profiler")] - let mut render_pass = self - .profiler - .scope("blit to surface", &mut render_pass, device); - render_pass.set_pipeline(&blit.pipeline); - render_pass.set_bind_group(0, &bind_group, &[]); - render_pass.draw(0..6, 0..1); + if let Some(captured) = result.captured { + let debug = self + .debug + .as_ref() + .expect("renderer should have configured surface_format to use on a surface"); + let bump = result.bump.as_ref().unwrap(); + // TODO: We could avoid this download if `DebugLayers::VALIDATION` is unset. + let downloads = DebugDownloads::map(&self.engine, &captured, bump).await?; + debug.render( + &mut recording, + surface_proxy, + &captured, + bump, + params, + &downloads, + ); + + // TODO: this sucks. better to release everything in a helper + // TODO: it would be much better to have a way to safely destroy a buffer. 
+ self.engine.free_download(captured.lines); + captured.release_buffers(&mut recording); + } } - #[cfg(feature = "wgpu-profiler")] - self.profiler.resolve_queries(&mut encoder); - queue.submit(Some(encoder.finish())); - self.target = Some(target); + + let surface_view = surface + .texture + .create_view(&wgpu::TextureViewDescriptor::default()); + let external_resources = [ + ExternalResource::Image(target_proxy, &target.view), + ExternalResource::Image(surface_proxy, &surface_view), + ]; + self.engine.run_recording( + device, + queue, + &recording, + &external_resources, + "blit (render_to_surface_async)", + #[cfg(feature = "wgpu-profiler")] + &mut self.profiler, + )?; + #[cfg(feature = "wgpu-profiler")] { self.profiler.end_frame().unwrap(); @@ -623,7 +722,9 @@ impl Renderer { self.profile_result = Some(result); } } - Ok(bump) + + self.target = Some(target); + Ok(result.bump) } } @@ -632,11 +733,13 @@ struct TargetTexture { view: TextureView, width: u32, height: u32, + format: wgpu::TextureFormat, } #[cfg(feature = "wgpu")] impl TargetTexture { fn new(device: &Device, width: u32, height: u32) -> Self { + let format = wgpu::TextureFormat::Rgba8Unorm; let texture = device.create_texture(&wgpu::TextureDescriptor { label: None, size: wgpu::Extent3d { @@ -648,7 +751,7 @@ impl TargetTexture { sample_count: 1, dimension: wgpu::TextureDimension::D2, usage: wgpu::TextureUsages::STORAGE_BINDING | wgpu::TextureUsages::TEXTURE_BINDING, - format: wgpu::TextureFormat::Rgba8Unorm, + format, view_formats: &[], }); let view = texture.create_view(&wgpu::TextureViewDescriptor::default()); @@ -656,19 +759,17 @@ impl TargetTexture { view, width, height, + format, } } } #[cfg(feature = "wgpu")] -struct BlitPipeline { - bind_layout: wgpu::BindGroupLayout, - pipeline: wgpu::RenderPipeline, -} +struct BlitPipeline(ShaderId); #[cfg(feature = "wgpu")] impl BlitPipeline { - fn new(device: &Device, format: TextureFormat) -> Self { + fn new(device: &Device, format: TextureFormat, engine: &mut 
WgpuEngine) -> Self { const SHADERS: &str = r#" @vertex fn vs_main(@builtin(vertex_index) ix: u32) -> @builtin(position) vec4 { @@ -698,69 +799,54 @@ impl BlitPipeline { return vec4(rgba_sep.rgb * rgba_sep.a, rgba_sep.a); } "#; - - let shader = device.create_shader_module(wgpu::ShaderModuleDescriptor { + let module = device.create_shader_module(wgpu::ShaderModuleDescriptor { label: Some("blit shaders"), source: wgpu::ShaderSource::Wgsl(SHADERS.into()), }); - let bind_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor { - label: None, - entries: &[wgpu::BindGroupLayoutEntry { - visibility: wgpu::ShaderStages::FRAGMENT, - binding: 0, - ty: wgpu::BindingType::Texture { - sample_type: wgpu::TextureSampleType::Float { filterable: true }, - view_dimension: wgpu::TextureViewDimension::D2, - multisampled: false, - }, - count: None, - }], - }); - let pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor { - label: None, - bind_group_layouts: &[&bind_layout], - push_constant_ranges: &[], - }); - let pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor { - label: None, - layout: Some(&pipeline_layout), - vertex: wgpu::VertexState { - module: &shader, - entry_point: "vs_main", - compilation_options: PipelineCompilationOptions::default(), - buffers: &[], - }, - fragment: Some(wgpu::FragmentState { - module: &shader, - entry_point: "fs_main", - compilation_options: PipelineCompilationOptions::default(), - targets: &[Some(wgpu::ColorTargetState { - format, - blend: None, - write_mask: wgpu::ColorWrites::ALL, - })], - }), - primitive: wgpu::PrimitiveState { - topology: wgpu::PrimitiveTopology::TriangleList, - strip_index_format: None, - front_face: wgpu::FrontFace::Ccw, - cull_mode: Some(wgpu::Face::Back), - polygon_mode: wgpu::PolygonMode::Fill, - unclipped_depth: false, - conservative: false, - }, - depth_stencil: None, - multisample: wgpu::MultisampleState { - count: 1, - mask: !0, - alpha_to_coverage_enabled: 
false, + let shader_id = engine.add_render_shader( + device, + "blit", + &module, + "vs_main", + "fs_main", + wgpu::PrimitiveTopology::TriangleList, + wgpu::ColorTargetState { + format, + blend: None, + write_mask: wgpu::ColorWrites::ALL, }, - multiview: None, - cache: None, - }); - Self { - bind_layout, - pipeline, - } + None, + &[( + BindType::ImageRead(ImageFormat::from_wgpu(format)), + wgpu::ShaderStages::FRAGMENT, + )], + ); + Self(shader_id) + } +} + +#[cfg(all(feature = "debug_layers", feature = "wgpu"))] +pub(crate) struct DebugDownloads<'a> { + pub lines: wgpu::BufferSlice<'a>, +} + +#[cfg(all(feature = "debug_layers", feature = "wgpu"))] +impl<'a> DebugDownloads<'a> { + pub async fn map( + engine: &'a WgpuEngine, + captured: &render::CapturedBuffers, + bump: &BumpAllocators, + ) -> Result> { + use vello_encoding::LineSoup; + + let Some(lines_buf) = engine.get_download(captured.lines) else { + return Err(Error::DownloadError("linesoup")); + }; + + let lines = lines_buf.slice(..bump.lines as u64 * std::mem::size_of::() as u64); + let (sender, receiver) = futures_intrusive::channel::shared::oneshot_channel(); + lines.map_async(wgpu::MapMode::Read, move |v| sender.send(v).unwrap()); + receiver.receive().await.expect("channel was closed")?; + Ok(Self { lines }) } } diff --git a/vello/src/recording.rs b/vello/src/recording.rs index e3c984006..503d65cdb 100644 --- a/vello/src/recording.rs +++ b/vello/src/recording.rs @@ -52,6 +52,11 @@ pub struct ImageProxy { #[derive(Clone, Copy)] pub enum ResourceProxy { Buffer(BufferProxy), + BufferRange { + proxy: BufferProxy, + offset: u64, + size: u64, + }, Image(ImageProxy), } @@ -64,11 +69,6 @@ pub enum Command { /// Commands the data to be uploaded to the given image. UploadImage(ImageProxy, Vec), WriteImage(ImageProxy, [u32; 2], Image), - // Discussion question: third argument is vec of resources? - // Maybe use tricks to make more ergonomic? 
- // Alternative: provide bufs & images as separate sequences - Dispatch(ShaderId, (u32, u32, u32), Vec), - DispatchIndirect(ShaderId, BufferProxy, u64, Vec), Download(BufferProxy), /// Commands to clear the buffer from an offset on for a length of the given size. /// If the size is [None], it clears until the end. @@ -77,6 +77,12 @@ pub enum Command { FreeBuffer(BufferProxy), /// Commands to free the image. FreeImage(ImageProxy), + // Discussion question: third argument is vec of resources? + // Maybe use tricks to make more ergonomic? + // Alternative: provide bufs & images as separate sequences + Dispatch(ShaderId, (u32, u32, u32), Vec), + DispatchIndirect(ShaderId, BufferProxy, u64, Vec), + Draw(DrawParams), } /// The type of resource that will be bound to a slot in a shader. @@ -95,6 +101,16 @@ pub enum BindType { // TODO: Uniform, Sampler, maybe others } +pub struct DrawParams { + pub shader_id: ShaderId, + pub instance_count: u32, + pub vertex_count: u32, + pub vertex_buffer: Option, + pub resources: Vec, + pub target: ImageProxy, + pub clear_color: Option<[f32; 4]>, +} + impl Recording { /// Appends a [`Command`] to the back of the [`Recording`]. pub fn push(&mut self, cmd: Command) { @@ -167,6 +183,11 @@ impl Recording { self.push(Command::DispatchIndirect(shader, buf, offset, r)); } + /// Issue a draw call + pub fn draw(&mut self, params: DrawParams) { + self.push(Command::Draw(params)); + } + /// Prepare a buffer for downloading. /// /// Currently this copies to a download buffer. 
The original buffer can be freed @@ -194,6 +215,11 @@ impl Recording { pub fn free_resource(&mut self, resource: ResourceProxy) { match resource { ResourceProxy::Buffer(buf) => self.free_buffer(buf), + ResourceProxy::BufferRange { + proxy, + offset: _, + size: _, + } => self.free_buffer(proxy), ResourceProxy::Image(image) => self.free_image(image), } } @@ -220,6 +246,15 @@ impl ImageFormat { Self::Bgra8 => wgpu::TextureFormat::Bgra8Unorm, } } + + #[cfg(feature = "wgpu")] + pub fn from_wgpu(format: wgpu::TextureFormat) -> Self { + match format { + wgpu::TextureFormat::Rgba8Unorm => Self::Rgba8, + wgpu::TextureFormat::Bgra8Unorm => Self::Bgra8, + _ => unimplemented!(), + } + } } impl ImageProxy { diff --git a/vello/src/render.rs b/vello/src/render.rs index 8a81d9843..bbd1c8c57 100644 --- a/vello/src/render.rs +++ b/vello/src/render.rs @@ -4,6 +4,7 @@ //! Take an encoded scene and create a graph to render it use std::mem::size_of; +use std::sync::atomic::AtomicBool; use crate::recording::{BufferProxy, ImageFormat, ImageProxy, Recording, ResourceProxy}; use crate::shaders::FullShaders; @@ -19,6 +20,18 @@ pub struct Render { fine_wg_count: Option, fine_resources: Option, mask_buf: Option, + + #[cfg(feature = "debug_layers")] + captured_buffers: Option, +} + +#[cfg(feature = "debug_layers")] +impl Drop for Render { + fn drop(&mut self) { + if self.captured_buffers.is_some() { + unreachable!("Render captured buffers without freeing them"); + } + } } /// Resources produced by pipeline, needed for fine rasterization. @@ -37,6 +50,31 @@ struct FineResources { out_image: ImageProxy, } +/// A collection of internal buffers that are used for debug visualization when the +/// `debug_layers` feature is enabled. The contents of these buffers remain GPU resident +/// and must be freed directly by the caller. +/// +/// Some of these buffers are also scheduled for a download to allow their contents to be +/// processed for CPU-side validation. These buffers are documented as such. 
+#[cfg(feature = "debug_layers")] +pub struct CapturedBuffers { + pub sizes: vello_encoding::BufferSizes, + + /// Buffers that remain GPU-only + pub path_bboxes: BufferProxy, + + /// Buffers scheduled for download + pub lines: BufferProxy, +} + +#[cfg(feature = "debug_layers")] +impl CapturedBuffers { + pub fn release_buffers(self, recording: &mut Recording) { + recording.free_buffer(self.path_bboxes); + recording.free_buffer(self.lines); + } +} + #[cfg(feature = "wgpu")] pub(crate) fn render_full( scene: &Scene, @@ -77,6 +115,8 @@ impl Render { fine_wg_count: None, fine_resources: None, mask_buf: None, + #[cfg(feature = "debug_layers")] + captured_buffers: None, } } @@ -95,6 +135,7 @@ impl Render { use vello_encoding::RenderConfig; let mut recording = Recording::default(); let mut packed = vec![]; + let (layout, ramps, images) = resolver.resolve(encoding, &mut packed); let gradient_image = if ramps.height == 0 { ResourceProxy::new_image(1, 1, ImageFormat::Rgba8) @@ -107,6 +148,15 @@ impl Render { data, )) }; + if cfg!(not(feature = "debug_layers")) && !params.debug.is_empty() { + static HAS_WARNED: AtomicBool = AtomicBool::new(false); + if !HAS_WARNED.swap(true, std::sync::atomic::Ordering::Release) { + log::warn!( + "Requested debug layers {debug:?} but `debug_layers` feature is not enabled.", + debug = params.debug + ); + } + } let image_atlas = if images.images.is_empty() { ImageProxy::new(1, 1, ImageFormat::Rgba8) } else { @@ -310,7 +360,6 @@ impl Render { ], ); recording.free_resource(draw_monoid_buf); - recording.free_resource(path_bbox_buf); recording.free_resource(clip_bbox_buf); // Note: this only needs to be rounded up because of the workaround to store the tile_offset // in storage rather than workgroup memory. 
@@ -396,7 +445,6 @@ impl Render { ); recording.free_buffer(indirect_count_buf); recording.free_resource(seg_counts_buf); - recording.free_resource(lines_buf); recording.free_resource(scene_buf); recording.free_resource(draw_monoid_buf); recording.free_resource(bin_header_buf); @@ -419,6 +467,30 @@ impl Render { recording.download(*bump_buf.as_buf().unwrap()); } recording.free_resource(bump_buf); + + #[cfg(feature = "debug_layers")] + { + if robust { + let path_bboxes = *path_bbox_buf.as_buf().unwrap(); + let lines = *lines_buf.as_buf().unwrap(); + recording.download(lines); + + self.captured_buffers = Some(CapturedBuffers { + sizes: cpu_config.buffer_sizes, + path_bboxes, + lines, + }); + } else { + recording.free_resource(path_bbox_buf); + recording.free_resource(lines_buf); + } + } + #[cfg(not(feature = "debug_layers"))] + { + recording.free_resource(path_bbox_buf); + recording.free_resource(lines_buf); + } + recording } @@ -509,4 +581,9 @@ impl Render { .as_buf() .unwrap() } + + #[cfg(feature = "debug_layers")] + pub fn take_captured_buffers(&mut self) -> Option { + self.captured_buffers.take() + } } diff --git a/vello/src/shaders.rs b/vello/src/shaders.rs index 41e13468e..bf34bad46 100644 --- a/vello/src/shaders.rs +++ b/vello/src/shaders.rs @@ -77,7 +77,7 @@ pub(crate) fn full_shaders( .into(); #[cfg(not(feature = "hot_reload"))] let source = shaders.$name.wgsl.code; - engine.add_shader( + engine.add_compute_shader( device, $label, source, diff --git a/vello/src/wgpu_engine.rs b/vello/src/wgpu_engine.rs index 67cd92f56..42ab919db 100644 --- a/vello/src/wgpu_engine.rs +++ b/vello/src/wgpu_engine.rs @@ -11,8 +11,8 @@ use vello_shaders::cpu::CpuBinding; use wgpu::{ BindGroup, BindGroupLayout, Buffer, BufferUsages, CommandEncoder, CommandEncoderDescriptor, - ComputePipeline, Device, PipelineCompilationOptions, Queue, Texture, TextureAspect, - TextureUsages, TextureView, TextureViewDimension, + ComputePipeline, Device, PipelineCompilationOptions, Queue, 
RenderPipeline, Texture, + TextureAspect, TextureUsages, TextureView, TextureViewDimension, }; use crate::{ @@ -43,8 +43,13 @@ pub(crate) struct WgpuEngine { pub(crate) image_overrides: HashMap>>, } +enum PipelineState { + Compute(ComputePipeline), + Render(RenderPipeline), +} + struct WgpuShader { - pipeline: ComputePipeline, + pipeline: PipelineState, bind_group_layout: BindGroupLayout, } @@ -235,7 +240,7 @@ impl WgpuEngine { /// /// Maybe should do template instantiation here? But shader compilation pipeline feels maybe /// a bit separate. - pub fn add_shader( + pub fn add_compute_shader( &mut self, device: &Device, label: &'static str, @@ -271,54 +276,9 @@ impl WgpuEngine { } } - let entries = layout - .iter() - .enumerate() - .map(|(i, bind_type)| match bind_type { - BindType::Buffer | BindType::BufReadOnly => wgpu::BindGroupLayoutEntry { - binding: i as u32, - visibility: wgpu::ShaderStages::COMPUTE, - ty: wgpu::BindingType::Buffer { - ty: wgpu::BufferBindingType::Storage { - read_only: *bind_type == BindType::BufReadOnly, - }, - has_dynamic_offset: false, - min_binding_size: None, - }, - count: None, - }, - BindType::Uniform => wgpu::BindGroupLayoutEntry { - binding: i as u32, - visibility: wgpu::ShaderStages::COMPUTE, - ty: wgpu::BindingType::Buffer { - ty: wgpu::BufferBindingType::Uniform, - has_dynamic_offset: false, - min_binding_size: None, - }, - count: None, - }, - BindType::Image(format) | BindType::ImageRead(format) => { - wgpu::BindGroupLayoutEntry { - binding: i as u32, - visibility: wgpu::ShaderStages::COMPUTE, - ty: if *bind_type == BindType::ImageRead(*format) { - wgpu::BindingType::Texture { - sample_type: wgpu::TextureSampleType::Float { filterable: true }, - view_dimension: wgpu::TextureViewDimension::D2, - multisampled: false, - } - } else { - wgpu::BindingType::StorageTexture { - access: wgpu::StorageTextureAccess::WriteOnly, - format: format.to_wgpu(), - view_dimension: wgpu::TextureViewDimension::D2, - } - }, - count: None, - } - } - }) 
- .collect::>(); + let entries = Self::create_bind_group_layout_entries( + layout.iter().map(|b| (*b, wgpu::ShaderStages::COMPUTE)), + ); #[cfg(not(target_arch = "wasm32"))] if let Some(uninit) = self.shaders_to_initialise.as_mut() { let id = add(Shader { @@ -342,6 +302,73 @@ impl WgpuEngine { }) } + #[allow(clippy::too_many_arguments)] + pub fn add_render_shader( + &mut self, + device: &Device, + label: &'static str, + module: &wgpu::ShaderModule, + vertex_main: &'static str, + fragment_main: &'static str, + topology: wgpu::PrimitiveTopology, + color_attachment: wgpu::ColorTargetState, + vertex_buffer: Option, + bind_layout: &[(BindType, wgpu::ShaderStages)], + ) -> ShaderId { + let entries = Self::create_bind_group_layout_entries(bind_layout.iter().copied()); + let bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor { + label: None, + entries: &entries, + }); + let pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor { + label: None, + bind_group_layouts: &[&bind_group_layout], + push_constant_ranges: &[], + }); + let pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor { + label: Some(label), + layout: Some(&pipeline_layout), + vertex: wgpu::VertexState { + module, + entry_point: vertex_main, + buffers: vertex_buffer + .as_ref() + .map(core::slice::from_ref) + .unwrap_or_default(), + compilation_options: PipelineCompilationOptions::default(), + }, + fragment: Some(wgpu::FragmentState { + module, + entry_point: fragment_main, + targets: &[Some(color_attachment)], + compilation_options: PipelineCompilationOptions::default(), + }), + primitive: wgpu::PrimitiveState { + topology, + strip_index_format: None, + front_face: wgpu::FrontFace::Ccw, + cull_mode: Some(wgpu::Face::Back), + polygon_mode: wgpu::PolygonMode::Fill, + unclipped_depth: false, + conservative: false, + }, + depth_stencil: None, + multisample: wgpu::MultisampleState::default(), + multiview: None, + cache: None, + }); + 
let id = self.shaders.len(); + self.shaders.push(Shader { + wgpu: Some(WgpuShader { + pipeline: PipelineState::Render(pipeline), + bind_group_layout, + }), + cpu: None, + label, + }); + ShaderId(id) + } + pub fn run_recording( &mut self, device: &Device, @@ -365,8 +392,11 @@ impl WgpuEngine { transient_map .bufs .insert(buf_proxy.id, TransientBuf::Cpu(bytes)); - let usage = - BufferUsages::COPY_SRC | BufferUsages::COPY_DST | BufferUsages::STORAGE; + // TODO: restrict VERTEX usage to "debug_layers" feature? + let usage = BufferUsages::COPY_SRC + | BufferUsages::COPY_DST + | BufferUsages::STORAGE + | BufferUsages::VERTEX; let buf = self .pool .get_buf(buf_proxy.size, buf_proxy.name, usage, device); @@ -523,7 +553,10 @@ impl WgpuEngine { let query = profiler .begin_query(shader.label, &mut cpass, device) .with_parent(Some(&query)); - cpass.set_pipeline(&wgpu_shader.pipeline); + let PipelineState::Compute(pipeline) = &wgpu_shader.pipeline else { + panic!("cannot issue a dispatch with a render pipeline"); + }; + cpass.set_pipeline(pipeline); cpass.set_bind_group(0, &bind_group, &[]); cpass.dispatch_workgroups(x, y, z); #[cfg(feature = "wgpu-profiler")] @@ -570,7 +603,10 @@ impl WgpuEngine { let query = profiler .begin_query(shader.label, &mut cpass, device) .with_parent(Some(&query)); - cpass.set_pipeline(&wgpu_shader.pipeline); + let PipelineState::Compute(pipeline) = &wgpu_shader.pipeline else { + panic!("cannot issue a dispatch with a render pipeline"); + }; + cpass.set_pipeline(pipeline); cpass.set_bind_group(0, &bind_group, &[]); let buf = self.bind_map.get_gpu_buf(proxy.id).ok_or( Error::UnavailableBufferUsed(proxy.name, "indirect dispatch"), @@ -581,6 +617,68 @@ impl WgpuEngine { } } } + Command::Draw(draw_params) => { + let shader = &self.shaders[draw_params.shader_id.0]; + #[cfg(feature = "wgpu-profiler")] + let label = shader.label; + let ShaderKind::Wgpu(shader) = shader.select() else { + panic!("a render pass does not have a CPU equivalent"); + }; + let 
bind_group = transient_map.create_bind_group( + &mut self.bind_map, + &mut self.pool, + device, + queue, + &mut encoder, + &shader.bind_group_layout, + &draw_params.resources, + ); + let render_target = transient_map + .materialize_external_image_for_render_pass(&draw_params.target); + let mut rpass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor { + label: None, + color_attachments: &[Some(wgpu::RenderPassColorAttachment { + view: render_target, + resolve_target: None, + ops: wgpu::Operations { + load: match draw_params.clear_color { + Some(c) => wgpu::LoadOp::Clear(wgpu::Color { + r: c[0] as f64, + g: c[1] as f64, + b: c[2] as f64, + a: c[3] as f64, + }), + None => wgpu::LoadOp::Load, + }, + store: wgpu::StoreOp::Store, + }, + })], + depth_stencil_attachment: None, + occlusion_query_set: None, + timestamp_writes: None, + }); + #[cfg(feature = "wgpu-profiler")] + let query = profiler + .begin_query(label, &mut rpass, device) + .with_parent(Some(&query)); + let PipelineState::Render(pipeline) = &shader.pipeline else { + panic!("cannot issue a draw with a compute pipeline"); + }; + rpass.set_pipeline(pipeline); + if let Some(proxy) = draw_params.vertex_buffer { + // TODO: need a way to materialize a CPU initialized buffer. For now assume + // buffer exists? 
Also, need to materialize this buffer with vertex usage + let buf = self + .bind_map + .get_gpu_buf(proxy.id) + .ok_or(Error::UnavailableBufferUsed(proxy.name, "draw"))?; + rpass.set_vertex_buffer(0, buf.slice(..)); + } + rpass.set_bind_group(0, &bind_group, &[]); + rpass.draw(0..draw_params.vertex_count, 0..draw_params.instance_count); + #[cfg(feature = "wgpu-profiler")] + profiler.end_query(&mut rpass, query); + } Command::Download(proxy) => { let src_buf = self .bind_map @@ -617,6 +715,9 @@ impl WgpuEngine { } #[cfg(feature = "wgpu-profiler")] profiler.end_query(&mut encoder, query); + // TODO: This only actually needs to happen once per frame, but run_recording happens two or three times + #[cfg(feature = "wgpu-profiler")] + profiler.resolve_queries(&mut encoder); queue.submit(Some(encoder.finish())); for id in free_bufs { if let Some(buf) = self.bind_map.buf_map.remove(&id) { @@ -649,6 +750,58 @@ impl WgpuEngine { self.downloads.remove(&buf.id); } + fn create_bind_group_layout_entries( + layout: impl Iterator, + ) -> Vec { + layout + .enumerate() + .map(|(i, (bind_type, visibility))| match bind_type { + BindType::Buffer | BindType::BufReadOnly => wgpu::BindGroupLayoutEntry { + binding: i as u32, + visibility, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { + read_only: bind_type == BindType::BufReadOnly, + }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + BindType::Uniform => wgpu::BindGroupLayoutEntry { + binding: i as u32, + visibility, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Uniform, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + BindType::Image(format) | BindType::ImageRead(format) => { + wgpu::BindGroupLayoutEntry { + binding: i as u32, + visibility, + ty: if bind_type == BindType::ImageRead(format) { + wgpu::BindingType::Texture { + sample_type: wgpu::TextureSampleType::Float { filterable: true }, + view_dimension: 
wgpu::TextureViewDimension::D2, + multisampled: false, + } + } else { + wgpu::BindingType::StorageTexture { + access: wgpu::StorageTextureAccess::WriteOnly, + format: format.to_wgpu(), + view_dimension: wgpu::TextureViewDimension::D2, + } + }, + count: None, + } + } + }) + .collect::>() + } + fn create_compute_pipeline( device: &Device, label: &str, @@ -682,7 +835,7 @@ impl WgpuEngine { cache: None, }); WgpuShader { - pipeline, + pipeline: PipelineState::Compute(pipeline), bind_group_layout, } } @@ -879,6 +1032,14 @@ impl<'a> TransientBindMap<'a> { } } + fn materialize_external_image_for_render_pass(&mut self, proxy: &ImageProxy) -> &TextureView { + // TODO: Maybe this should support instantiating a transient texture. Right now all render + // passes target a `SurfaceTexture`, so supporting external textures is sufficient. + self.images + .get(&proxy.id) + .expect("texture not materialized") + } + #[allow(clippy::too_many_arguments)] fn create_bind_group( &mut self, @@ -892,17 +1053,23 @@ impl<'a> TransientBindMap<'a> { ) -> BindGroup { for proxy in bindings { match proxy { - ResourceProxy::Buffer(proxy) => { + ResourceProxy::Buffer(proxy) + | ResourceProxy::BufferRange { + proxy, + offset: _, + size: _, + } => { if self.bufs.contains_key(&proxy.id) { continue; } match bind_map.buf_map.entry(proxy.id) { Entry::Vacant(v) => { - // TODO: only some buffers will need indirect, but does it hurt? + // TODO: only some buffers will need indirect & vertex, but does it hurt? 
let usage = BufferUsages::COPY_SRC | BufferUsages::COPY_DST | BufferUsages::STORAGE - | BufferUsages::INDIRECT; + | BufferUsages::INDIRECT + | BufferUsages::VERTEX; let buf = pool.get_buf(proxy.size, proxy.name, usage, device); if bind_map.pending_clears.remove(&proxy.id) { encoder.clear_buffer(&buf, 0, None); @@ -966,6 +1133,24 @@ impl<'a> TransientBindMap<'a> { resource: buf.as_entire_binding(), } } + ResourceProxy::BufferRange { + proxy, + offset, + size, + } => { + let buf = match self.bufs.get(&proxy.id) { + Some(TransientBuf::Gpu(b)) => b, + _ => bind_map.get_gpu_buf(proxy.id).unwrap(), + }; + wgpu::BindGroupEntry { + binding: i as u32, + resource: wgpu::BindingResource::Buffer(wgpu::BufferBinding { + buffer: buf, + offset: *offset, + size: core::num::NonZeroU64::new(*size), + }), + } + } ResourceProxy::Image(proxy) => { let view = self .images @@ -995,10 +1180,15 @@ impl<'a> TransientBindMap<'a> { // First pass is mutable; create buffers as needed for resource in bindings { match resource { - ResourceProxy::Buffer(buf) => match self.bufs.get(&buf.id) { + ResourceProxy::Buffer(proxy) + | ResourceProxy::BufferRange { + proxy, + offset: _, + size: _, + } => match self.bufs.get(&proxy.id) { Some(TransientBuf::Cpu(_)) => (), Some(TransientBuf::Gpu(_)) => panic!("buffer was already materialized on GPU"), - _ => bind_map.materialize_cpu_buf(buf), + _ => bind_map.materialize_cpu_buf(proxy), }, ResourceProxy::Image(_) => todo!(), }; @@ -1011,6 +1201,7 @@ impl<'a> TransientBindMap<'a> { Some(TransientBuf::Cpu(b)) => CpuBinding::Buffer(b), _ => bind_map.get_cpu_buf(buf.id), }, + ResourceProxy::BufferRange { .. } => todo!(), ResourceProxy::Image(_) => todo!(), }) .collect() diff --git a/vello_tests/src/lib.rs b/vello_tests/src/lib.rs index e2b6c7898..e97910593 100644 --- a/vello_tests/src/lib.rs +++ b/vello_tests/src/lib.rs @@ -98,6 +98,7 @@ pub async fn get_scene_image(params: &TestParams, scene: &Scene) -> Result