diff --git a/examples/headless/src/main.rs b/examples/headless/src/main.rs
index c5cb8b06b..98d00c3f8 100644
--- a/examples/headless/src/main.rs
+++ b/examples/headless/src/main.rs
@@ -139,6 +139,7 @@ async fn render(mut scenes: SceneSet, index: usize, args: &Args) -> Result<()> {
         width,
         height,
         antialiasing_method: vello::AaConfig::Area,
+        debug: vello::DebugLayers::none(),
     };
     let mut scene = Scene::new();
     scene.append(&fragment, Some(transform));
diff --git a/examples/simple/src/main.rs b/examples/simple/src/main.rs
index 5e5c581ad..bbf721645 100644
--- a/examples/simple/src/main.rs
+++ b/examples/simple/src/main.rs
@@ -7,7 +7,7 @@ use std::sync::Arc;
 use vello::kurbo::{Affine, Circle, Ellipse, Line, RoundedRect, Stroke};
 use vello::peniko::Color;
 use vello::util::{RenderContext, RenderSurface};
-use vello::{AaConfig, Renderer, RendererOptions, Scene};
+use vello::{AaConfig, DebugLayers, Renderer, RendererOptions, Scene};
 use winit::application::ApplicationHandler;
 use winit::dpi::LogicalSize;
 use winit::event::*;
@@ -151,6 +151,7 @@ impl<'s> ApplicationHandler for SimpleVelloApp<'s> {
                             width,
                             height,
                             antialiasing_method: AaConfig::Msaa16,
+                            debug: DebugLayers::none(),
                         },
                     )
                     .expect("failed to render to surface");
diff --git a/examples/with_winit/Cargo.toml b/examples/with_winit/Cargo.toml
index 1d508d4e8..783e233d2 100644
--- a/examples/with_winit/Cargo.toml
+++ b/examples/with_winit/Cargo.toml
@@ -25,8 +25,9 @@ workspace = true
 name = "with_winit_bin"
 path = "src/main.rs"
 
+
 [dependencies]
-vello = { workspace = true, features = ["buffer_labels"] }
+vello = { workspace = true, features = ["buffer_labels", "debug_layers"] }
 scenes = { workspace = true }
 
 anyhow = { workspace = true }
diff --git a/examples/with_winit/src/lib.rs b/examples/with_winit/src/lib.rs
index 4c6bc97a3..0e8678149 100644
--- a/examples/with_winit/src/lib.rs
+++ b/examples/with_winit/src/lib.rs
@@ -162,6 +162,8 @@ struct VelloApp<'s> {
 
     prev_scene_ix: i32,
     modifiers: ModifiersState,
+
+    debug: vello::DebugLayers,
 }
 
 impl<'s> ApplicationHandler<UserEvent> for VelloApp<'s> {
@@ -329,6 +331,27 @@ impl<'s> ApplicationHandler<UserEvent> for VelloApp<'s> {
                                         },
                                     );
                                 }
+                                debug_layer @ ("1" | "2" | "3" | "4") => {
+                                    match debug_layer {
+                                        "1" => {
+                                            self.debug.toggle(vello::DebugLayers::BOUNDING_BOXES);
+                                        }
+                                        "2" => {
+                                            self.debug
+                                                .toggle(vello::DebugLayers::LINESOUP_SEGMENTS);
+                                        }
+                                        "3" => {
+                                            self.debug.toggle(vello::DebugLayers::LINESOUP_POINTS);
+                                        }
+                                        "4" => {
+                                            self.debug.toggle(vello::DebugLayers::VALIDATION);
+                                        }
+                                        _ => unreachable!(),
+                                    }
+                                    if !self.debug.is_empty() && !self.async_pipeline {
+                                        log::warn!("Debug Layers won't work without using `--async-pipeline`. Requested {:?}", self.debug);
+                                    }
+                                }
                                 _ => {}
                             }
                         }
@@ -464,6 +487,7 @@ impl<'s> ApplicationHandler<UserEvent> for VelloApp<'s> {
                     width,
                     height,
                     antialiasing_method,
+                    debug: self.debug,
                 };
                 self.scene.reset();
                 let mut transform = self.transform;
@@ -674,6 +698,8 @@ fn run(
         Some(render_state)
     };
 
+    let debug = vello::DebugLayers::none();
+
     let mut app = VelloApp {
         context: render_cx,
         renderers,
@@ -718,6 +744,7 @@ fn run(
         complexity: 0,
         prev_scene_ix: 0,
         modifiers: ModifiersState::default(),
+        debug,
     };
 
     event_loop.run_app(&mut app).expect("run to completion");
@@ -786,6 +813,7 @@ pub fn main() -> anyhow::Result<()> {
     #[cfg(not(target_arch = "wasm32"))]
     env_logger::builder()
         .format_timestamp(Some(env_logger::TimestampPrecision::Millis))
+        .filter_level(log::LevelFilter::Warn)
         .init();
     let args = parse_arguments();
     let scenes = args.args.select_scene_set()?;
diff --git a/vello/Cargo.toml b/vello/Cargo.toml
index 1fd55c6b3..83ba98d55 100644
--- a/vello/Cargo.toml
+++ b/vello/Cargo.toml
@@ -18,6 +18,7 @@ default = ["wgpu"]
 bump_estimate = ["vello_encoding/bump_estimate"]
 hot_reload = ["vello_shaders/compile"]
 buffer_labels = []
+debug_layers = []
 wgpu = ["dep:wgpu"]
 wgpu-profiler = ["dep:wgpu-profiler"]
 
diff --git a/vello/src/debug.rs b/vello/src/debug.rs
new file mode 100644
index 000000000..5ddf89e1d
--- /dev/null
+++ b/vello/src/debug.rs
@@ -0,0 +1,119 @@
+// Copyright 2023 the Vello Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+#[cfg(all(feature = "debug_layers", feature = "wgpu"))]
+mod renderer;
+#[cfg(all(feature = "debug_layers", feature = "wgpu"))]
+mod validate;
+
+use std::fmt::Debug;
+
+#[cfg(all(feature = "debug_layers", feature = "wgpu"))]
+pub(crate) use renderer::*;
+
+/// Bitflags for enabled debug operations.
+///
+/// Currently, all layers additionally require the `debug_layers` feature.
+#[derive(Copy, Clone)]
+pub struct DebugLayers(u8);
+
+impl Debug for DebugLayers {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let mut tuple = f.debug_tuple("DebugLayers");
+        if self.contains(Self::BOUNDING_BOXES) {
+            tuple.field(&"BOUNDING_BOXES");
+        }
+        if self.contains(Self::LINESOUP_SEGMENTS) {
+            tuple.field(&"LINESOUP_SEGMENTS");
+        }
+        if self.contains(Self::LINESOUP_POINTS) {
+            tuple.field(&"LINESOUP_POINTS");
+        }
+        if self.contains(Self::VALIDATION) {
+            tuple.field(&"VALIDATION");
+        }
+
+        tuple.finish()
+    }
+}
+
+// TODO: Currently all layers require read-back of the BumpAllocators buffer. This isn't strictly
+// necessary for layers other than `VALIDATION`. The debug visualizations use the bump buffer only
+// to obtain various instance counts for draws and these could instead get written out to an
+// indirect draw buffer. OTOH `VALIDATION` should always require readback since we want to be able
+// to run the same CPU-side tests for both CPU and GPU shaders.
+impl DebugLayers {
+    /// Visualize the bounding box of every path.
+    /// Requires the `debug_layers` feature.
+    pub const BOUNDING_BOXES: DebugLayers = DebugLayers(1 << 0);
+
+    /// Visualize the post-flattening line segments using line primitives.
+    /// Requires the `debug_layers` feature.
+    pub const LINESOUP_SEGMENTS: DebugLayers = DebugLayers(1 << 1);
+
+    /// Visualize the post-flattening line endpoints.
+    /// Requires the `debug_layers` feature.
+    pub const LINESOUP_POINTS: DebugLayers = DebugLayers(1 << 2);
+
+    /// Enable validation of internal buffer contents and visualize errors. Validation tests are
+    /// run on the CPU and require buffer contents to be read-back.
+    ///
+    /// Supported validation tests:
+    ///
+    ///    - Watertightness: validate that every line segment within a path is connected without
+    ///      any gaps. Line endpoints that don't precisely overlap another endpoint get visualized
+    ///      as red circles and logged to stderr.
+    ///
+    /// Requires the `debug_layers` feature.
+    pub const VALIDATION: DebugLayers = DebugLayers(1 << 3);
+
+    /// Construct a `DebugLayers` from the raw bits.
+    pub const fn from_bits(bits: u8) -> Self {
+        Self(bits)
+    }
+
+    /// Get the raw representation of this value.
+    pub const fn bits(self) -> u8 {
+        self.0
+    }
+
+    /// A `DebugLayers` with no layers enabled.
+    pub const fn none() -> Self {
+        Self(0)
+    }
+
+    /// A `DebugLayers` with all layers enabled.
+    pub const fn all() -> Self {
+        // Custom BitOr is not const, so need to manipulate the inner value here
+        Self(
+            Self::BOUNDING_BOXES.0
+                | Self::LINESOUP_SEGMENTS.0
+                | Self::LINESOUP_POINTS.0
+                | Self::VALIDATION.0,
+        )
+    }
+
+    /// True if this `DebugLayers` has no layers enabled.
+    pub const fn is_empty(self) -> bool {
+        self.0 == 0
+    }
+
+    /// Determine whether `self` is a superset of `mask`.
+    pub const fn contains(self, mask: DebugLayers) -> bool {
+        self.0 & mask.0 == mask.0
+    }
+
+    /// Toggle the value of the layers specified in mask.
+    pub fn toggle(&mut self, mask: DebugLayers) {
+        self.0 ^= mask.0;
+    }
+}
+
+/// Returns the union of the two input `DebugLayers`.
+impl std::ops::BitOr for DebugLayers {
+    type Output = Self;
+
+    fn bitor(self, rhs: Self) -> Self {
+        Self(self.0 | rhs.0)
+    }
+}
diff --git a/vello/src/debug/renderer.rs b/vello/src/debug/renderer.rs
new file mode 100644
index 000000000..534607894
--- /dev/null
+++ b/vello/src/debug/renderer.rs
@@ -0,0 +1,497 @@
+// Copyright 2023 the Vello Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+use super::DebugLayers;
+use crate::{
+    debug::validate::{validate_line_soup, LineEndpoint},
+    recording::{BindType, DrawParams, ImageProxy, Recording, ResourceProxy, ShaderId},
+    render::CapturedBuffers,
+    wgpu_engine::WgpuEngine,
+    DebugDownloads, RenderParams,
+};
+
+use {
+    bytemuck::{offset_of, Pod, Zeroable},
+    peniko::Color,
+    vello_encoding::{BumpAllocators, LineSoup, PathBbox},
+};
+pub(crate) struct DebugRenderer {
+    // `clear_tint` slightly darkens the output from the vello renderer to make the debug overlays
+    // more distinguishable.
+    clear_tint: ShaderId,
+    bboxes: ShaderId,
+    linesoup: ShaderId,
+    linesoup_points: ShaderId,
+    unpaired_points: ShaderId,
+}
+
+impl DebugRenderer {
+    pub fn new(
+        device: &wgpu::Device,
+        target_format: wgpu::TextureFormat,
+        engine: &mut WgpuEngine,
+    ) -> Self {
+        let module = device.create_shader_module(wgpu::ShaderModuleDescriptor {
+            label: Some("debug layers"),
+            source: wgpu::ShaderSource::Wgsl(SHADERS.into()),
+        });
+
+        let clear_tint = engine.add_render_shader(
+            device,
+            "clear-tint",
+            &module,
+            "full_screen_quad_vert",
+            "solid_color_frag",
+            wgpu::PrimitiveTopology::TriangleStrip,
+            wgpu::ColorTargetState {
+                format: target_format,
+                blend: Some(wgpu::BlendState {
+                    color: wgpu::BlendComponent {
+                        src_factor: wgpu::BlendFactor::SrcAlpha,
+                        dst_factor: wgpu::BlendFactor::OneMinusSrcAlpha,
+                        operation: wgpu::BlendOperation::Add,
+                    },
+                    alpha: wgpu::BlendComponent::OVER,
+                }),
+                write_mask: wgpu::ColorWrites::ALL,
+            },
+            None,
+            &[],
+        );
+        let bboxes = engine.add_render_shader(
+            device,
+            "bbox-debug",
+            &module,
+            "bbox_vert",
+            "solid_color_frag",
+            wgpu::PrimitiveTopology::LineStrip,
+            wgpu::ColorTargetState {
+                format: target_format,
+                blend: None,
+                write_mask: wgpu::ColorWrites::ALL,
+            },
+            // This mirrors the layout of the PathBbox structure.
+            Some(wgpu::VertexBufferLayout {
+                array_stride: std::mem::size_of::<PathBbox>() as u64,
+                step_mode: wgpu::VertexStepMode::Instance,
+                attributes: &[
+                    wgpu::VertexAttribute {
+                        format: wgpu::VertexFormat::Sint32x2,
+                        offset: offset_of!(PathBbox, x0) as u64,
+                        shader_location: 0,
+                    },
+                    wgpu::VertexAttribute {
+                        format: wgpu::VertexFormat::Sint32x2,
+                        offset: offset_of!(PathBbox, x1) as u64,
+                        shader_location: 1,
+                    },
+                ],
+            }),
+            &[(BindType::Uniform, wgpu::ShaderStages::VERTEX)],
+        );
+        let linesoup = engine.add_render_shader(
+            device,
+            "linesoup-debug",
+            &module,
+            "linesoup_vert",
+            "solid_color_frag",
+            wgpu::PrimitiveTopology::TriangleStrip,
+            wgpu::ColorTargetState {
+                format: target_format,
+                blend: None,
+                write_mask: wgpu::ColorWrites::ALL,
+            },
+            // This mirrors the layout of the LineSoup structure.
+            Some(wgpu::VertexBufferLayout {
+                array_stride: std::mem::size_of::<LineSoup>() as u64,
+                step_mode: wgpu::VertexStepMode::Instance,
+                attributes: &[
+                    wgpu::VertexAttribute {
+                        format: wgpu::VertexFormat::Float32x2,
+                        offset: offset_of!(LineSoup, p0) as u64,
+                        shader_location: 0,
+                    },
+                    wgpu::VertexAttribute {
+                        format: wgpu::VertexFormat::Float32x2,
+                        offset: offset_of!(LineSoup, p1) as u64,
+                        shader_location: 1,
+                    },
+                ],
+            }),
+            &[(BindType::Uniform, wgpu::ShaderStages::VERTEX)],
+        );
+        let linesoup_points = engine.add_render_shader(
+            device,
+            "linepoints-debug",
+            &module,
+            "linepoints_vert",
+            "sdf_circle_frag",
+            wgpu::PrimitiveTopology::TriangleStrip,
+            wgpu::ColorTargetState {
+                format: target_format,
+                blend: Some(wgpu::BlendState {
+                    color: wgpu::BlendComponent {
+                        src_factor: wgpu::BlendFactor::SrcAlpha,
+                        dst_factor: wgpu::BlendFactor::OneMinusSrcAlpha,
+                        operation: wgpu::BlendOperation::Add,
+                    },
+                    alpha: wgpu::BlendComponent::OVER,
+                }),
+                write_mask: wgpu::ColorWrites::ALL,
+            },
+            // This mirrors the layout of the LineSoup structure. The pipeline only processes the
+            // first point of each line. Since all points should be paired, this is enough to
+            // render all points. Any unpaired points are drawn separately by the
+            // `unpaired_points` pipeline, so no point should get missed.
+            Some(wgpu::VertexBufferLayout {
+                array_stride: std::mem::size_of::<LineSoup>() as u64,
+                step_mode: wgpu::VertexStepMode::Instance,
+                attributes: &[wgpu::VertexAttribute {
+                    format: wgpu::VertexFormat::Float32x2,
+                    offset: offset_of!(LineSoup, p0) as u64,
+                    shader_location: 0,
+                }],
+            }),
+            &[
+                (BindType::Uniform, wgpu::ShaderStages::VERTEX),
+                (
+                    BindType::Uniform,
+                    wgpu::ShaderStages::VERTEX | wgpu::ShaderStages::FRAGMENT,
+                ),
+            ],
+        );
+        let unpaired_points = engine.add_render_shader(
+            device,
+            "linepoints-debug",
+            &module,
+            "linepoints_vert",
+            "sdf_circle_frag",
+            wgpu::PrimitiveTopology::TriangleStrip,
+            wgpu::ColorTargetState {
+                format: target_format,
+                blend: Some(wgpu::BlendState {
+                    color: wgpu::BlendComponent {
+                        src_factor: wgpu::BlendFactor::SrcAlpha,
+                        dst_factor: wgpu::BlendFactor::OneMinusSrcAlpha,
+                        operation: wgpu::BlendOperation::Add,
+                    },
+                    alpha: wgpu::BlendComponent::OVER,
+                }),
+                write_mask: wgpu::ColorWrites::ALL,
+            },
+            // This mirrors the layout of the LineEndpoint structure.
+            Some(wgpu::VertexBufferLayout {
+                array_stride: std::mem::size_of::<LineEndpoint>() as u64,
+                step_mode: wgpu::VertexStepMode::Instance,
+                attributes: &[wgpu::VertexAttribute {
+                    format: wgpu::VertexFormat::Float32x2,
+                    offset: offset_of!(LineEndpoint, x) as u64,
+                    shader_location: 0,
+                }],
+            }),
+            &[
+                (BindType::Uniform, wgpu::ShaderStages::VERTEX),
+                (
+                    BindType::Uniform,
+                    wgpu::ShaderStages::VERTEX | wgpu::ShaderStages::FRAGMENT,
+                ),
+            ],
+        );
+
+        Self {
+            clear_tint,
+            bboxes,
+            linesoup,
+            linesoup_points,
+            unpaired_points,
+        }
+    }
+
+    pub fn render(
+        &self,
+        recording: &mut Recording,
+        target: ImageProxy,
+        captured: &CapturedBuffers,
+        bump: &BumpAllocators,
+        params: &RenderParams,
+        downloads: &DebugDownloads,
+    ) {
+        if params.debug.is_empty() {
+            return;
+        }
+
+        let (unpaired_pts_len, unpaired_pts_buf) = if params.debug.contains(DebugLayers::VALIDATION)
+        {
+            // TODO: have this write directly to a GPU buffer?
+            let unpaired_pts: Vec<LineEndpoint> =
+                validate_line_soup(bytemuck::cast_slice(&downloads.lines.get_mapped_range()));
+            if unpaired_pts.is_empty() {
+                (0, None)
+            } else {
+                (
+                    unpaired_pts.len(),
+                    Some(
+                        recording
+                            .upload("unpaired points", bytemuck::cast_slice(&unpaired_pts[..])),
+                    ),
+                )
+            }
+        } else {
+            (0, None)
+        };
+
+        let uniforms = Uniforms {
+            width: params.width,
+            height: params.height,
+        };
+        let uniforms_buf = ResourceProxy::Buffer(
+            recording.upload_uniform("uniforms", bytemuck::bytes_of(&uniforms)),
+        );
+
+        let linepoints_uniforms = [
+            LinepointsUniforms::new(Color::DARK_CYAN, 10.),
+            LinepointsUniforms::new(Color::RED, 80.),
+        ];
+        let linepoints_uniforms_buf = recording.upload_uniform(
+            "linepoints uniforms",
+            bytemuck::bytes_of(&linepoints_uniforms),
+        );
+
+        recording.draw(DrawParams {
+            shader_id: self.clear_tint,
+            instance_count: 1,
+            vertex_count: 4,
+            vertex_buffer: None,
+            resources: vec![],
+            target,
+            clear_color: None,
+        });
+        if params.debug.contains(DebugLayers::BOUNDING_BOXES) {
+            recording.draw(DrawParams {
+                shader_id: self.bboxes,
+                instance_count: captured.sizes.path_bboxes.len(),
+                vertex_count: 5,
+                vertex_buffer: Some(captured.path_bboxes),
+                resources: vec![uniforms_buf],
+                target,
+                clear_color: None,
+            });
+        }
+        if params.debug.contains(DebugLayers::LINESOUP_SEGMENTS) {
+            recording.draw(DrawParams {
+                shader_id: self.linesoup,
+                instance_count: bump.lines,
+                vertex_count: 4,
+                vertex_buffer: Some(captured.lines),
+                resources: vec![uniforms_buf],
+                target,
+                clear_color: None,
+            });
+        }
+        if params.debug.contains(DebugLayers::LINESOUP_POINTS) {
+            recording.draw(DrawParams {
+                shader_id: self.linesoup_points,
+                instance_count: bump.lines,
+                vertex_count: 4,
+                vertex_buffer: Some(captured.lines),
+                resources: vec![
+                    uniforms_buf,
+                    ResourceProxy::BufferRange {
+                        proxy: linepoints_uniforms_buf,
+                        offset: 0,
+                        size: std::mem::size_of::<LinepointsUniforms>() as u64,
+                    },
+                ],
+                target,
+                clear_color: None,
+            });
+        }
+        if let Some(unpaired_pts_buf) = unpaired_pts_buf {
+            recording.draw(DrawParams {
+                shader_id: self.unpaired_points,
+                instance_count: unpaired_pts_len.try_into().unwrap(),
+                vertex_count: 4,
+                vertex_buffer: Some(unpaired_pts_buf),
+                resources: vec![
+                    uniforms_buf,
+                    ResourceProxy::BufferRange {
+                        proxy: linepoints_uniforms_buf,
+                        offset: std::mem::size_of::<LinepointsUniforms>() as u64,
+                        size: std::mem::size_of::<LinepointsUniforms>() as u64,
+                    },
+                ],
+                target,
+                clear_color: None,
+            });
+            recording.free_buffer(unpaired_pts_buf);
+        }
+
+        recording.free_resource(uniforms_buf);
+        recording.free_buffer(linepoints_uniforms_buf);
+    }
+}
+
+#[derive(Copy, Clone, Zeroable, Pod)]
+#[repr(C)]
+struct Uniforms {
+    width: u32,
+    height: u32,
+}
+
+#[derive(Copy, Clone, Zeroable, Pod)]
+#[repr(C)]
+struct LinepointsUniforms {
+    point_color: [f32; 3],
+    point_size: f32,
+    // Uniform parameters for individual SDF point draws are stored in a single buffer.
+    // The 240 bytes of padding grow each element to 256 bytes to meet the offset alignment
+    // (see https://www.w3.org/TR/webgpu/#dom-supported-limits-minuniformbufferoffsetalignment)
+    _pad0: [u32; 30],
+    _pad1: [u32; 30],
+}
+
+impl LinepointsUniforms {
+    fn new(color: Color, point_size: f32) -> Self {
+        Self {
+            point_color: [
+                color.r as f32 / 255.,
+                color.g as f32 / 255.,
+                color.b as f32 / 255.,
+            ],
+            point_size,
+            _pad0: [0; 30],
+            _pad1: [0; 30],
+        }
+    }
+}
+
+const SHADERS: &str = r#"
+
+// Map from y-down normalized coordinates to NDC:
+fn map_to_ndc(p: vec2f) -> vec4f {
+    return vec4(vec2(1., -1.) * (2. * p - vec2(1.)), 0., 1.);
+}
+
+alias QuadVertices = array<vec2f, 4>;
+var<private> quad_vertices: QuadVertices = QuadVertices(
+    vec2<f32>(0., 1.),
+    vec2<f32>(0., 0.),
+    vec2<f32>(1., 0.),
+    vec2<f32>(1., 1.),
+);
+
+var<private> quad_fill_indices: array<u32, 4> = array<u32, 4>(0u, 3u, 1u, 2u);
+
+struct Uniforms {
+    width: u32,
+    height: u32,
+}
+@binding(0) @group(0) var<uniform> uniforms: Uniforms;
+
+struct VSOut {
+    @builtin(position) pos: vec4f,
+    @location(0) color: vec4f,
+}
+
+////////////
+
+@vertex
+fn full_screen_quad_vert(@builtin(vertex_index) vid: u32) -> VSOut {
+    let p = quad_vertices[quad_fill_indices[vid]];
+    // TODO: Make the alpha configurable here.
+    // The clear tint is a full-screen layer above the entire image with this color.
+    return VSOut(map_to_ndc(p), vec4(0., 0., 0., 0.2));
+}
+
+////////////
+
+struct BboxIn {
+	@location(0) p0: vec2i,
+	@location(1) p1: vec2i,
+}
+
+@vertex
+fn bbox_vert(@builtin(vertex_index) vid: u32, bbox: BboxIn) -> VSOut {
+    let ul = vec2f(f32(bbox.p0.x), f32(bbox.p0.y));
+    let br = vec2f(f32(bbox.p1.x), f32(bbox.p1.y));
+    let dim = br - ul;
+    let p = (ul + dim * quad_vertices[vid % 4u]) / vec2f(f32(uniforms.width), f32(uniforms.height));
+    return VSOut(map_to_ndc(p), vec4(0., 1., 0., 1.));
+}
+
+////////////
+
+struct LinesoupIn {
+    @location(0) p0: vec2f,
+    @location(1) p1: vec2f,
+}
+
+const LINE_THICKNESS: f32 = 4.;
+const WIND_DOWN_COLOR: vec3f = vec3(0., 1., 0.);
+const WIND_UP_COLOR: vec3f = vec3(1., 0., 0.);
+
+@vertex
+fn linesoup_vert(@builtin(vertex_index) vid: u32, line: LinesoupIn) -> VSOut {
+    let quad_corner = quad_vertices[quad_fill_indices[vid]] - vec2(0.5);
+    let v = line.p1 - line.p0;
+    let m = mix(line.p0, line.p1, 0.5);
+    let s = vec2(LINE_THICKNESS, length(v));
+    let vn = normalize(v);
+    let r = mat2x2(vn.y, -vn.x, vn.x, vn.y);
+    let p = (m + r * (s * quad_corner)) / vec2f(f32(uniforms.width), f32(uniforms.height));
+    //let color = vec4(0.7, 0.5, 0., 1.);
+    let color = vec4(select(WIND_UP_COLOR, WIND_DOWN_COLOR, v.y >= 0.), 1.);
+    return VSOut(map_to_ndc(p), color);
+}
+
+////////////
+
+struct LinepointsUniforms {
+    point_color: vec3f,
+    point_size: f32,
+}
+@binding(1) @group(0) var<uniform> linepoints_uniforms: LinepointsUniforms;
+
+struct SDFCircleOut {
+    @builtin(position) pos: vec4f,
+
+    // Unpremultiplied color of the circle.
+    @location(0) color: vec3f,
+
+    // The 2D position of the pixel fragment relative to the center of the quad. The quad edges
+    // are at coordinates (±1, 0) and (0, ±1).
+    @location(1) quad_relative: vec2f,
+}
+
+@vertex
+fn linepoints_vert(@builtin(vertex_index) vid: u32, @location(0) point: vec2f) -> SDFCircleOut {
+    let quad_corner = quad_vertices[quad_fill_indices[vid]] - vec2(0.5);
+    let rect_dim = vec2(linepoints_uniforms.point_size);
+    let p = (point + rect_dim * quad_corner) / vec2(f32(uniforms.width), f32(uniforms.height));
+
+    return SDFCircleOut(
+        map_to_ndc(p),
+        linepoints_uniforms.point_color,
+        // Normalize the corners of the quad such that they form a vector of length √2. This should
+        // align the edge fragments to ±1. The post-interpolation values of `quad_relative` will
+        // then form a distance field that can represent a circle of radius 1 within the quad
+        // (where the distance is relative to the center of the circle).
+        normalize(quad_corner) * sqrt(2.),
+    );
+}
+
+@fragment
+fn solid_color_frag(in: VSOut) -> @location(0) vec4f {
+    return in.color;
+}
+
+@fragment
+fn sdf_circle_frag(in: SDFCircleOut) -> @location(0) vec4f {
+    // Draw an antialiased circle with a fading margin as a visual effect. `THRESHOLD` is the
+    // distance from the center of the circle to the edge where the fade begins.
+    let THRESHOLD = 0.6;
+    let d = saturate(length(in.quad_relative));
+    let alpha = select(1., 1. - smoothstep(THRESHOLD, 1., d), d > THRESHOLD);
+    return vec4(in.color.rgb, alpha);
+}
+"#;
diff --git a/vello/src/debug/validate.rs b/vello/src/debug/validate.rs
new file mode 100644
index 000000000..d3bd6e5d3
--- /dev/null
+++ b/vello/src/debug/validate.rs
@@ -0,0 +1,64 @@
+// Copyright 2023 the Vello Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+use {
+    bytemuck::{Pod, Zeroable},
+    std::{collections::BTreeSet, fmt},
+    vello_encoding::LineSoup,
+};
+
+#[derive(Copy, Clone, Default, PartialEq, Eq, PartialOrd, Ord, Pod, Zeroable)]
+#[repr(C)]
+pub struct LineEndpoint {
+    pub path_ix: u32,
+
+    // Coordinates stored as the raw bit patterns of IEEE-754 32-bit floats.
+    // We use u32 because we compare exact bit patterns rather than proximity, to evaluate exact watertightness.
+    // Endpoints are kept in a BTreeSet, which requires `Ord`; f32 doesn't implement it, but u32 does.
+    pub x: u32,
+    pub y: u32,
+}
+
+impl LineEndpoint {
+    pub fn new(line: &LineSoup, start_or_end: bool) -> Self {
+        let (x, y) = if start_or_end {
+            (line.p0[0], line.p0[1])
+        } else {
+            (line.p1[0], line.p1[1])
+        };
+        Self {
+            path_ix: line.path_ix,
+            x: x.to_bits(),
+            y: y.to_bits(),
+        }
+    }
+}
+
+impl fmt::Debug for LineEndpoint {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("Point")
+            .field("path_ix", &self.path_ix)
+            .field("x", &f32::from_bits(self.x))
+            .field("y", &f32::from_bits(self.y))
+            .finish()
+    }
+}
+
+pub(crate) fn validate_line_soup(lines: &[LineSoup]) -> Vec<LineEndpoint> {
+    let mut points = BTreeSet::new();
+    for line in lines {
+        let pts = [
+            LineEndpoint::new(line, true),
+            LineEndpoint::new(line, false),
+        ];
+        for p in pts {
+            if !points.remove(&p) {
+                points.insert(p);
+            }
+        }
+    }
+    if !points.is_empty() {
+        eprintln!("Unpaired points are present: {:#?}", points);
+    }
+    points.into_iter().collect()
+}
diff --git a/vello/src/lib.rs b/vello/src/lib.rs
index a8e22f53d..4aa072381 100644
--- a/vello/src/lib.rs
+++ b/vello/src/lib.rs
@@ -81,6 +81,7 @@
 //!
 //! See the [`examples/`](https://github.com/linebender/vello/tree/main/examples) folder to see how that code integrates with frameworks like winit.
 
+mod debug;
 mod recording;
 mod render;
 mod scene;
@@ -124,10 +125,11 @@ use vello_encoding::Resolver;
 #[cfg(feature = "wgpu")]
 use wgpu_engine::{ExternalResource, WgpuEngine};
 
+pub use debug::DebugLayers;
 /// Temporary export, used in `with_winit` for stats
 pub use vello_encoding::BumpAllocators;
 #[cfg(feature = "wgpu")]
-use wgpu::{Device, PipelineCompilationOptions, Queue, SurfaceTexture, TextureFormat, TextureView};
+use wgpu::{Device, Queue, SurfaceTexture, TextureFormat, TextureView};
 #[cfg(all(feature = "wgpu", feature = "wgpu-profiler"))]
 use wgpu_profiler::{GpuProfiler, GpuProfilerSettings};
 
@@ -212,6 +214,11 @@ pub enum Error {
     #[cfg(feature = "wgpu")]
     #[error("Failed to async map a buffer")]
     BufferAsyncError(#[from] wgpu::BufferAsyncError),
+    /// Failed to download an internal buffer for debug visualization.
+    #[cfg(feature = "wgpu")]
+    #[cfg(feature = "debug_layers")]
+    #[error("Failed to download internal buffer '{0}' for visualization")]
+    DownloadError(&'static str),
 
     #[cfg(feature = "wgpu")]
     #[error("wgpu Error from scope")]
@@ -241,6 +248,8 @@ pub struct Renderer {
     resolver: Resolver,
     shaders: FullShaders,
     blit: Option<BlitPipeline>,
+    #[cfg(feature = "debug_layers")]
+    debug: Option<debug::DebugRenderer>,
     target: Option<TargetTexture>,
     #[cfg(feature = "wgpu-profiler")]
     pub profiler: GpuProfiler,
@@ -268,6 +277,13 @@ pub struct RenderParams {
     /// The anti-aliasing algorithm. The selected algorithm must have been initialized while
     /// constructing the `Renderer`.
     pub antialiasing_method: AaConfig,
+
+    /// Options for debug layer rendering.
+    ///
+    /// This only has an effect when the `debug_layers` feature is enabled.
+    // This is exposed publicly as the least-effort way to avoid changing the API when features change.
+    // We expect the API to change here in the near future.
+    pub debug: DebugLayers,
 }
 
 #[cfg(feature = "wgpu")]
@@ -296,6 +312,13 @@ pub struct RendererOptions {
     pub num_init_threads: Option<NonZeroUsize>,
 }
 
+#[cfg(feature = "wgpu")]
+struct RenderResult {
+    bump: Option<BumpAllocators>,
+    #[cfg(feature = "debug_layers")]
+    captured: Option<render::CapturedBuffers>,
+}
+
 #[cfg(feature = "wgpu")]
 impl Renderer {
     /// Creates a new renderer for the specified device.
@@ -311,7 +334,11 @@ impl Renderer {
         engine.build_shaders_if_needed(device, options.num_init_threads);
         let blit = options
             .surface_format
-            .map(|surface_format| BlitPipeline::new(device, surface_format));
+            .map(|surface_format| BlitPipeline::new(device, surface_format, &mut engine));
+        #[cfg(feature = "debug_layers")]
+        let debug = options
+            .surface_format
+            .map(|surface_format| debug::DebugRenderer::new(device, surface_format, &mut engine));
 
         Ok(Self {
             options,
@@ -319,6 +346,8 @@ impl Renderer {
             resolver: Resolver::new(),
             shaders,
             blit,
+            #[cfg(feature = "debug_layers")]
+            debug,
             target: None,
             // Use 3 pending frames
             #[cfg(feature = "wgpu-profiler")]
@@ -407,45 +436,39 @@ impl Renderer {
             .blit
             .as_ref()
             .expect("renderer should have configured surface_format to use on a surface");
-        let mut encoder =
-            device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
-        {
-            let surface_view = surface
-                .texture
-                .create_view(&wgpu::TextureViewDescriptor::default());
-            let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
-                label: None,
-                layout: &blit.bind_layout,
-                entries: &[wgpu::BindGroupEntry {
-                    binding: 0,
-                    resource: wgpu::BindingResource::TextureView(&target.view),
-                }],
-            });
-            let mut render_pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
-                label: None,
-                color_attachments: &[Some(wgpu::RenderPassColorAttachment {
-                    view: &surface_view,
-                    resolve_target: None,
-                    ops: wgpu::Operations {
-                        load: wgpu::LoadOp::Clear(wgpu::Color::default()),
-                        store: wgpu::StoreOp::Store,
-                    },
-                })],
-                depth_stencil_attachment: None,
-                occlusion_query_set: None,
-                timestamp_writes: None,
-            });
+        let mut recording = Recording::default();
+        let target_proxy = ImageProxy::new(width, height, ImageFormat::from_wgpu(target.format));
+        let surface_proxy = ImageProxy::new(
+            width,
+            height,
+            ImageFormat::from_wgpu(surface.texture.format()),
+        );
+        recording.draw(recording::DrawParams {
+            shader_id: blit.0,
+            instance_count: 1,
+            vertex_count: 6,
+            vertex_buffer: None,
+            resources: vec![ResourceProxy::Image(target_proxy)],
+            target: surface_proxy,
+            clear_color: Some([0., 0., 0., 0.]),
+        });
+
+        let surface_view = surface
+            .texture
+            .create_view(&wgpu::TextureViewDescriptor::default());
+        let external_resources = [
+            ExternalResource::Image(target_proxy, &target.view),
+            ExternalResource::Image(surface_proxy, &surface_view),
+        ];
+        self.engine.run_recording(
+            device,
+            queue,
+            &recording,
+            &external_resources,
+            "blit (render_to_surface)",
             #[cfg(feature = "wgpu-profiler")]
-            let mut render_pass = self
-                .profiler
-                .scope("blit to surface", &mut render_pass, device);
-            render_pass.set_pipeline(&blit.pipeline);
-            render_pass.set_bind_group(0, &bind_group, &[]);
-            render_pass.draw(0..6, 0..1);
-        }
-        #[cfg(feature = "wgpu-profiler")]
-        self.profiler.resolve_queries(&mut encoder);
-        queue.submit(Some(encoder.finish()));
+            &mut self.profiler,
+        )?;
         self.target = Some(target);
         #[cfg(feature = "wgpu-profiler")]
         {
@@ -467,12 +490,26 @@ impl Renderer {
         let mut engine = WgpuEngine::new(self.options.use_cpu);
         // We choose not to initialise these shaders in parallel, to ensure the error scope works correctly
         let shaders = shaders::full_shaders(device, &mut engine, &self.options)?;
+        let blit = self
+            .options
+            .surface_format
+            .map(|surface_format| BlitPipeline::new(device, surface_format, &mut engine));
+        #[cfg(feature = "debug_layers")]
+        let debug = self
+            .options
+            .surface_format
+            .map(|format| debug::DebugRenderer::new(device, format, &mut engine));
         let error = device.pop_error_scope().await;
         if let Some(error) = error {
             return Err(error.into());
         }
         self.engine = engine;
         self.shaders = shaders;
+        self.blit = blit;
+        #[cfg(feature = "debug_layers")]
+        {
+            self.debug = debug;
+        }
         Ok(())
     }
 
@@ -495,10 +532,46 @@ impl Renderer {
         texture: &TextureView,
         params: &RenderParams,
     ) -> Result<Option<BumpAllocators>> {
+        let result = self
+            .render_to_texture_async_internal(device, queue, scene, texture, params)
+            .await?;
+        #[cfg(feature = "debug_layers")]
+        {
+            // TODO: it would be better to improve buffer ownership tracking so that it's not
+            // necessary to submit a whole new Recording to free the captured buffers.
+            if let Some(captured) = result.captured {
+                let mut recording = Recording::default();
+                // TODO: this is clumsy; it would be better to release everything in a helper.
+                self.engine.free_download(captured.lines);
+                captured.release_buffers(&mut recording);
+                self.engine.run_recording(
+                    device,
+                    queue,
+                    &recording,
+                    &[],
+                    "free memory",
+                    #[cfg(feature = "wgpu-profiler")]
+                    &mut self.profiler,
+                )?;
+            }
+        }
+        Ok(result.bump)
+    }
+
+    async fn render_to_texture_async_internal(
+        &mut self,
+        device: &Device,
+        queue: &Queue,
+        scene: &Scene,
+        texture: &TextureView,
+        params: &RenderParams,
+    ) -> Result<RenderResult> {
         let mut render = Render::new();
         let encoding = scene.encoding();
-        // TODO: turn this on; the download feature interacts with CPU dispatch
-        let robust = false;
+        // TODO: turn this on; the download feature interacts with CPU dispatch.
+        // Currently this is always enabled when the `debug_layers` setting is enabled as the bump
+        // counts are used for debug visualization.
+        let robust = cfg!(feature = "debug_layers");
         let recording = render.render_encoding_coarse(
             encoding,
             &mut self.resolver,
@@ -508,6 +581,8 @@ impl Renderer {
         );
         let target = render.out_image();
         let bump_buf = render.bump_buf();
+        #[cfg(feature = "debug_layers")]
+        let captured = render.take_captured_buffers();
         self.engine.run_recording(
             device,
             queue,
@@ -543,7 +618,11 @@ impl Renderer {
             #[cfg(feature = "wgpu-profiler")]
             &mut self.profiler,
         )?;
-        Ok(bump)
+        Ok(RenderResult {
+            bump,
+            #[cfg(feature = "debug_layers")]
+            captured,
+        })
     }
 
     /// See [`Self::render_to_surface`]
@@ -566,53 +645,73 @@ impl Renderer {
         if target.width != width || target.height != height {
             target = TargetTexture::new(device, width, height);
         }
-        let bump = self
-            .render_to_texture_async(device, queue, scene, &target.view, params)
+        let result = self
+            .render_to_texture_async_internal(device, queue, scene, &target.view, params)
             .await?;
         let blit = self
             .blit
             .as_ref()
             .expect("renderer should have configured surface_format to use on a surface");
-        let mut encoder =
-            device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
+        let mut recording = Recording::default();
+        let target_proxy = ImageProxy::new(width, height, ImageFormat::from_wgpu(target.format));
+        let surface_proxy = ImageProxy::new(
+            width,
+            height,
+            ImageFormat::from_wgpu(surface.texture.format()),
+        );
+        recording.draw(recording::DrawParams {
+            shader_id: blit.0,
+            instance_count: 1,
+            vertex_count: 6,
+            vertex_buffer: None,
+            resources: vec![ResourceProxy::Image(target_proxy)],
+            target: surface_proxy,
+            clear_color: Some([0., 0., 0., 0.]),
+        });
+
+        #[cfg(feature = "debug_layers")]
         {
-            let surface_view = surface
-                .texture
-                .create_view(&wgpu::TextureViewDescriptor::default());
-            let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
-                label: None,
-                layout: &blit.bind_layout,
-                entries: &[wgpu::BindGroupEntry {
-                    binding: 0,
-                    resource: wgpu::BindingResource::TextureView(&target.view),
-                }],
-            });
-            let mut render_pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
-                label: None,
-                color_attachments: &[Some(wgpu::RenderPassColorAttachment {
-                    view: &surface_view,
-                    resolve_target: None,
-                    ops: wgpu::Operations {
-                        load: wgpu::LoadOp::Clear(wgpu::Color::default()),
-                        store: wgpu::StoreOp::Store,
-                    },
-                })],
-                depth_stencil_attachment: None,
-                timestamp_writes: None,
-                occlusion_query_set: None,
-            });
-            #[cfg(feature = "wgpu-profiler")]
-            let mut render_pass = self
-                .profiler
-                .scope("blit to surface", &mut render_pass, device);
-            render_pass.set_pipeline(&blit.pipeline);
-            render_pass.set_bind_group(0, &bind_group, &[]);
-            render_pass.draw(0..6, 0..1);
+            if let Some(captured) = result.captured {
+                let debug = self
+                    .debug
+                    .as_ref()
+                    .expect("renderer should have configured surface_format to use on a surface");
+                let bump = result.bump.as_ref().unwrap();
+                // TODO: We could avoid this download if `DebugLayers::VALIDATION` is unset.
+                let downloads = DebugDownloads::map(&self.engine, &captured, bump).await?;
+                debug.render(
+                    &mut recording,
+                    surface_proxy,
+                    &captured,
+                    bump,
+                    params,
+                    &downloads,
+                );
+
+                // TODO: this is clumsy; it would be better to release everything in a helper.
+                // TODO: it would be much better to have a way to safely destroy a buffer.
+                self.engine.free_download(captured.lines);
+                captured.release_buffers(&mut recording);
+            }
         }
-        #[cfg(feature = "wgpu-profiler")]
-        self.profiler.resolve_queries(&mut encoder);
-        queue.submit(Some(encoder.finish()));
-        self.target = Some(target);
+
+        let surface_view = surface
+            .texture
+            .create_view(&wgpu::TextureViewDescriptor::default());
+        let external_resources = [
+            ExternalResource::Image(target_proxy, &target.view),
+            ExternalResource::Image(surface_proxy, &surface_view),
+        ];
+        self.engine.run_recording(
+            device,
+            queue,
+            &recording,
+            &external_resources,
+            "blit (render_to_surface_async)",
+            #[cfg(feature = "wgpu-profiler")]
+            &mut self.profiler,
+        )?;
+
         #[cfg(feature = "wgpu-profiler")]
         {
             self.profiler.end_frame().unwrap();
@@ -623,7 +722,9 @@ impl Renderer {
                 self.profile_result = Some(result);
             }
         }
-        Ok(bump)
+
+        self.target = Some(target);
+        Ok(result.bump)
     }
 }
 
@@ -632,11 +733,13 @@ struct TargetTexture {
     view: TextureView,
     width: u32,
     height: u32,
+    format: wgpu::TextureFormat,
 }
 
 #[cfg(feature = "wgpu")]
 impl TargetTexture {
     fn new(device: &Device, width: u32, height: u32) -> Self {
+        let format = wgpu::TextureFormat::Rgba8Unorm;
         let texture = device.create_texture(&wgpu::TextureDescriptor {
             label: None,
             size: wgpu::Extent3d {
@@ -648,7 +751,7 @@ impl TargetTexture {
             sample_count: 1,
             dimension: wgpu::TextureDimension::D2,
             usage: wgpu::TextureUsages::STORAGE_BINDING | wgpu::TextureUsages::TEXTURE_BINDING,
-            format: wgpu::TextureFormat::Rgba8Unorm,
+            format,
             view_formats: &[],
         });
         let view = texture.create_view(&wgpu::TextureViewDescriptor::default());
@@ -656,19 +759,17 @@ impl TargetTexture {
             view,
             width,
             height,
+            format,
         }
     }
 }
 
 #[cfg(feature = "wgpu")]
-struct BlitPipeline {
-    bind_layout: wgpu::BindGroupLayout,
-    pipeline: wgpu::RenderPipeline,
-}
+struct BlitPipeline(ShaderId);
 
 #[cfg(feature = "wgpu")]
 impl BlitPipeline {
-    fn new(device: &Device, format: TextureFormat) -> Self {
+    fn new(device: &Device, format: TextureFormat, engine: &mut WgpuEngine) -> Self {
         const SHADERS: &str = r#"
             @vertex
             fn vs_main(@builtin(vertex_index) ix: u32) -> @builtin(position) vec4<f32> {
@@ -698,69 +799,54 @@ impl BlitPipeline {
                 return vec4(rgba_sep.rgb * rgba_sep.a, rgba_sep.a);
             }
         "#;
-
-        let shader = device.create_shader_module(wgpu::ShaderModuleDescriptor {
+        let module = device.create_shader_module(wgpu::ShaderModuleDescriptor {
             label: Some("blit shaders"),
             source: wgpu::ShaderSource::Wgsl(SHADERS.into()),
         });
-        let bind_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
-            label: None,
-            entries: &[wgpu::BindGroupLayoutEntry {
-                visibility: wgpu::ShaderStages::FRAGMENT,
-                binding: 0,
-                ty: wgpu::BindingType::Texture {
-                    sample_type: wgpu::TextureSampleType::Float { filterable: true },
-                    view_dimension: wgpu::TextureViewDimension::D2,
-                    multisampled: false,
-                },
-                count: None,
-            }],
-        });
-        let pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
-            label: None,
-            bind_group_layouts: &[&bind_layout],
-            push_constant_ranges: &[],
-        });
-        let pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor {
-            label: None,
-            layout: Some(&pipeline_layout),
-            vertex: wgpu::VertexState {
-                module: &shader,
-                entry_point: "vs_main",
-                compilation_options: PipelineCompilationOptions::default(),
-                buffers: &[],
-            },
-            fragment: Some(wgpu::FragmentState {
-                module: &shader,
-                entry_point: "fs_main",
-                compilation_options: PipelineCompilationOptions::default(),
-                targets: &[Some(wgpu::ColorTargetState {
-                    format,
-                    blend: None,
-                    write_mask: wgpu::ColorWrites::ALL,
-                })],
-            }),
-            primitive: wgpu::PrimitiveState {
-                topology: wgpu::PrimitiveTopology::TriangleList,
-                strip_index_format: None,
-                front_face: wgpu::FrontFace::Ccw,
-                cull_mode: Some(wgpu::Face::Back),
-                polygon_mode: wgpu::PolygonMode::Fill,
-                unclipped_depth: false,
-                conservative: false,
-            },
-            depth_stencil: None,
-            multisample: wgpu::MultisampleState {
-                count: 1,
-                mask: !0,
-                alpha_to_coverage_enabled: false,
+        let shader_id = engine.add_render_shader(
+            device,
+            "blit",
+            &module,
+            "vs_main",
+            "fs_main",
+            wgpu::PrimitiveTopology::TriangleList,
+            wgpu::ColorTargetState {
+                format,
+                blend: None,
+                write_mask: wgpu::ColorWrites::ALL,
             },
-            multiview: None,
-            cache: None,
-        });
-        Self {
-            bind_layout,
-            pipeline,
-        }
+            None,
+            &[(
+                BindType::ImageRead(ImageFormat::from_wgpu(format)),
+                wgpu::ShaderStages::FRAGMENT,
+            )],
+        );
+        Self(shader_id)
+    }
+}
+
+#[cfg(all(feature = "debug_layers", feature = "wgpu"))]
+pub(crate) struct DebugDownloads<'a> {
+    pub lines: wgpu::BufferSlice<'a>,
+}
+
+#[cfg(all(feature = "debug_layers", feature = "wgpu"))]
+impl<'a> DebugDownloads<'a> {
+    pub async fn map(
+        engine: &'a WgpuEngine,
+        captured: &render::CapturedBuffers,
+        bump: &BumpAllocators,
+    ) -> Result<DebugDownloads<'a>> {
+        use vello_encoding::LineSoup;
+
+        let Some(lines_buf) = engine.get_download(captured.lines) else {
+            return Err(Error::DownloadError("linesoup"));
+        };
+
+        let lines = lines_buf.slice(..bump.lines as u64 * std::mem::size_of::<LineSoup>() as u64);
+        let (sender, receiver) = futures_intrusive::channel::shared::oneshot_channel();
+        lines.map_async(wgpu::MapMode::Read, move |v| sender.send(v).unwrap());
+        receiver.receive().await.expect("channel was closed")?;
+        Ok(Self { lines })
     }
 }
diff --git a/vello/src/recording.rs b/vello/src/recording.rs
index e3c984006..503d65cdb 100644
--- a/vello/src/recording.rs
+++ b/vello/src/recording.rs
@@ -52,6 +52,11 @@ pub struct ImageProxy {
 #[derive(Clone, Copy)]
 pub enum ResourceProxy {
     Buffer(BufferProxy),
+    BufferRange {
+        proxy: BufferProxy,
+        offset: u64,
+        size: u64,
+    },
     Image(ImageProxy),
 }
 
@@ -64,11 +69,6 @@ pub enum Command {
     /// Commands the data to be uploaded to the given image.
     UploadImage(ImageProxy, Vec<u8>),
     WriteImage(ImageProxy, [u32; 2], Image),
-    // Discussion question: third argument is vec of resources?
-    // Maybe use tricks to make more ergonomic?
-    // Alternative: provide bufs & images as separate sequences
-    Dispatch(ShaderId, (u32, u32, u32), Vec<ResourceProxy>),
-    DispatchIndirect(ShaderId, BufferProxy, u64, Vec<ResourceProxy>),
     Download(BufferProxy),
     /// Commands to clear the buffer from an offset on for a length of the given size.
     /// If the size is [None], it clears until the end.
@@ -77,6 +77,12 @@ pub enum Command {
     FreeBuffer(BufferProxy),
     /// Commands to free the image.
     FreeImage(ImageProxy),
+    // Discussion question: third argument is vec of resources?
+    // Maybe use tricks to make more ergonomic?
+    // Alternative: provide bufs & images as separate sequences
+    Dispatch(ShaderId, (u32, u32, u32), Vec<ResourceProxy>),
+    DispatchIndirect(ShaderId, BufferProxy, u64, Vec<ResourceProxy>),
+    Draw(DrawParams),
 }
 
 /// The type of resource that will be bound to a slot in a shader.
@@ -95,6 +101,16 @@ pub enum BindType {
     // TODO: Uniform, Sampler, maybe others
 }
 
+pub struct DrawParams {
+    pub shader_id: ShaderId,
+    pub instance_count: u32,
+    pub vertex_count: u32,
+    pub vertex_buffer: Option<BufferProxy>,
+    pub resources: Vec<ResourceProxy>,
+    pub target: ImageProxy,
+    pub clear_color: Option<[f32; 4]>,
+}
+
 impl Recording {
     /// Appends a [`Command`] to the back of the [`Recording`].
     pub fn push(&mut self, cmd: Command) {
@@ -167,6 +183,11 @@ impl Recording {
         self.push(Command::DispatchIndirect(shader, buf, offset, r));
     }
 
+    /// Appends a [`Command::Draw`] built from `params` to the [`Recording`].
+    pub fn draw(&mut self, params: DrawParams) {
+        self.push(Command::Draw(params));
+    }
+
     /// Prepare a buffer for downloading.
     ///
     /// Currently this copies to a download buffer. The original buffer can be freed
@@ -194,6 +215,11 @@ impl Recording {
     pub fn free_resource(&mut self, resource: ResourceProxy) {
         match resource {
             ResourceProxy::Buffer(buf) => self.free_buffer(buf),
+            ResourceProxy::BufferRange {
+                proxy,
+                offset: _,
+                size: _,
+            } => self.free_buffer(proxy),
             ResourceProxy::Image(image) => self.free_image(image),
         }
     }
@@ -220,6 +246,15 @@ impl ImageFormat {
             Self::Bgra8 => wgpu::TextureFormat::Bgra8Unorm,
         }
     }
+
+    #[cfg(feature = "wgpu")]
+    pub fn from_wgpu(format: wgpu::TextureFormat) -> Self {
+        match format {
+            wgpu::TextureFormat::Rgba8Unorm => Self::Rgba8,
+            wgpu::TextureFormat::Bgra8Unorm => Self::Bgra8,
+            _ => unimplemented!(),
+        }
+    }
 }
 
 impl ImageProxy {
diff --git a/vello/src/render.rs b/vello/src/render.rs
index 8a81d9843..bbd1c8c57 100644
--- a/vello/src/render.rs
+++ b/vello/src/render.rs
@@ -4,6 +4,7 @@
 //! Take an encoded scene and create a graph to render it
 
 use std::mem::size_of;
+use std::sync::atomic::AtomicBool;
 
 use crate::recording::{BufferProxy, ImageFormat, ImageProxy, Recording, ResourceProxy};
 use crate::shaders::FullShaders;
@@ -19,6 +20,18 @@ pub struct Render {
     fine_wg_count: Option<WorkgroupSize>,
     fine_resources: Option<FineResources>,
     mask_buf: Option<ResourceProxy>,
+
+    #[cfg(feature = "debug_layers")]
+    captured_buffers: Option<CapturedBuffers>,
+}
+
+#[cfg(feature = "debug_layers")]
+impl Drop for Render {
+    fn drop(&mut self) {
+        if self.captured_buffers.is_some() {
+            unreachable!("Render captured buffers without freeing them");
+        }
+    }
 }
 
 /// Resources produced by pipeline, needed for fine rasterization.
@@ -37,6 +50,31 @@ struct FineResources {
     out_image: ImageProxy,
 }
 
+/// A collection of internal buffers that are used for debug visualization when the
+/// `debug_layers` feature is enabled. The contents of these buffers remain GPU resident
+/// and must be freed directly by the caller.
+///
+/// Some of these buffers are also scheduled for a download to allow their contents to be
+/// processed for CPU-side validation. These buffers are documented as such.
+#[cfg(feature = "debug_layers")]
+pub struct CapturedBuffers {
+    pub sizes: vello_encoding::BufferSizes,
+
+    /// Buffers that remain GPU-only
+    pub path_bboxes: BufferProxy,
+
+    /// Buffers scheduled for download
+    pub lines: BufferProxy,
+}
+
+#[cfg(feature = "debug_layers")]
+impl CapturedBuffers {
+    pub fn release_buffers(self, recording: &mut Recording) {
+        recording.free_buffer(self.path_bboxes);
+        recording.free_buffer(self.lines);
+    }
+}
+
 #[cfg(feature = "wgpu")]
 pub(crate) fn render_full(
     scene: &Scene,
@@ -77,6 +115,8 @@ impl Render {
             fine_wg_count: None,
             fine_resources: None,
             mask_buf: None,
+            #[cfg(feature = "debug_layers")]
+            captured_buffers: None,
         }
     }
 
@@ -95,6 +135,7 @@ impl Render {
         use vello_encoding::RenderConfig;
         let mut recording = Recording::default();
         let mut packed = vec![];
+
         let (layout, ramps, images) = resolver.resolve(encoding, &mut packed);
         let gradient_image = if ramps.height == 0 {
             ResourceProxy::new_image(1, 1, ImageFormat::Rgba8)
@@ -107,6 +148,15 @@ impl Render {
                 data,
             ))
         };
+        if cfg!(not(feature = "debug_layers")) && !params.debug.is_empty() {
+            static HAS_WARNED: AtomicBool = AtomicBool::new(false);
+            if !HAS_WARNED.swap(true, std::sync::atomic::Ordering::Release) {
+                log::warn!(
+                    "Requested debug layers {debug:?} but `debug_layers` feature is not enabled.",
+                    debug = params.debug
+                );
+            }
+        }
         let image_atlas = if images.images.is_empty() {
             ImageProxy::new(1, 1, ImageFormat::Rgba8)
         } else {
@@ -310,7 +360,6 @@ impl Render {
             ],
         );
         recording.free_resource(draw_monoid_buf);
-        recording.free_resource(path_bbox_buf);
         recording.free_resource(clip_bbox_buf);
         // Note: this only needs to be rounded up because of the workaround to store the tile_offset
         // in storage rather than workgroup memory.
@@ -396,7 +445,6 @@ impl Render {
         );
         recording.free_buffer(indirect_count_buf);
         recording.free_resource(seg_counts_buf);
-        recording.free_resource(lines_buf);
         recording.free_resource(scene_buf);
         recording.free_resource(draw_monoid_buf);
         recording.free_resource(bin_header_buf);
@@ -419,6 +467,30 @@ impl Render {
             recording.download(*bump_buf.as_buf().unwrap());
         }
         recording.free_resource(bump_buf);
+
+        #[cfg(feature = "debug_layers")]
+        {
+            if robust {
+                let path_bboxes = *path_bbox_buf.as_buf().unwrap();
+                let lines = *lines_buf.as_buf().unwrap();
+                recording.download(lines);
+
+                self.captured_buffers = Some(CapturedBuffers {
+                    sizes: cpu_config.buffer_sizes,
+                    path_bboxes,
+                    lines,
+                });
+            } else {
+                recording.free_resource(path_bbox_buf);
+                recording.free_resource(lines_buf);
+            }
+        }
+        #[cfg(not(feature = "debug_layers"))]
+        {
+            recording.free_resource(path_bbox_buf);
+            recording.free_resource(lines_buf);
+        }
+
         recording
     }
 
@@ -509,4 +581,9 @@ impl Render {
             .as_buf()
             .unwrap()
     }
+
+    #[cfg(feature = "debug_layers")]
+    pub fn take_captured_buffers(&mut self) -> Option<CapturedBuffers> {
+        self.captured_buffers.take()
+    }
 }
diff --git a/vello/src/shaders.rs b/vello/src/shaders.rs
index 41e13468e..bf34bad46 100644
--- a/vello/src/shaders.rs
+++ b/vello/src/shaders.rs
@@ -77,7 +77,7 @@ pub(crate) fn full_shaders(
                 .into();
             #[cfg(not(feature = "hot_reload"))]
             let source = shaders.$name.wgsl.code;
-            engine.add_shader(
+            engine.add_compute_shader(
                 device,
                 $label,
                 source,
diff --git a/vello/src/wgpu_engine.rs b/vello/src/wgpu_engine.rs
index 67cd92f56..42ab919db 100644
--- a/vello/src/wgpu_engine.rs
+++ b/vello/src/wgpu_engine.rs
@@ -11,8 +11,8 @@ use vello_shaders::cpu::CpuBinding;
 
 use wgpu::{
     BindGroup, BindGroupLayout, Buffer, BufferUsages, CommandEncoder, CommandEncoderDescriptor,
-    ComputePipeline, Device, PipelineCompilationOptions, Queue, Texture, TextureAspect,
-    TextureUsages, TextureView, TextureViewDimension,
+    ComputePipeline, Device, PipelineCompilationOptions, Queue, RenderPipeline, Texture,
+    TextureAspect, TextureUsages, TextureView, TextureViewDimension,
 };
 
 use crate::{
@@ -43,8 +43,13 @@ pub(crate) struct WgpuEngine {
     pub(crate) image_overrides: HashMap<u64, Arc<wgpu::ImageCopyTextureBase<Texture>>>,
 }
 
+enum PipelineState {
+    Compute(ComputePipeline),
+    Render(RenderPipeline),
+}
+
 struct WgpuShader {
-    pipeline: ComputePipeline,
+    pipeline: PipelineState,
     bind_group_layout: BindGroupLayout,
 }
 
@@ -235,7 +240,7 @@ impl WgpuEngine {
     ///
     /// Maybe should do template instantiation here? But shader compilation pipeline feels maybe
     /// a bit separate.
-    pub fn add_shader(
+    pub fn add_compute_shader(
         &mut self,
         device: &Device,
         label: &'static str,
@@ -271,54 +276,9 @@ impl WgpuEngine {
             }
         }
 
-        let entries = layout
-            .iter()
-            .enumerate()
-            .map(|(i, bind_type)| match bind_type {
-                BindType::Buffer | BindType::BufReadOnly => wgpu::BindGroupLayoutEntry {
-                    binding: i as u32,
-                    visibility: wgpu::ShaderStages::COMPUTE,
-                    ty: wgpu::BindingType::Buffer {
-                        ty: wgpu::BufferBindingType::Storage {
-                            read_only: *bind_type == BindType::BufReadOnly,
-                        },
-                        has_dynamic_offset: false,
-                        min_binding_size: None,
-                    },
-                    count: None,
-                },
-                BindType::Uniform => wgpu::BindGroupLayoutEntry {
-                    binding: i as u32,
-                    visibility: wgpu::ShaderStages::COMPUTE,
-                    ty: wgpu::BindingType::Buffer {
-                        ty: wgpu::BufferBindingType::Uniform,
-                        has_dynamic_offset: false,
-                        min_binding_size: None,
-                    },
-                    count: None,
-                },
-                BindType::Image(format) | BindType::ImageRead(format) => {
-                    wgpu::BindGroupLayoutEntry {
-                        binding: i as u32,
-                        visibility: wgpu::ShaderStages::COMPUTE,
-                        ty: if *bind_type == BindType::ImageRead(*format) {
-                            wgpu::BindingType::Texture {
-                                sample_type: wgpu::TextureSampleType::Float { filterable: true },
-                                view_dimension: wgpu::TextureViewDimension::D2,
-                                multisampled: false,
-                            }
-                        } else {
-                            wgpu::BindingType::StorageTexture {
-                                access: wgpu::StorageTextureAccess::WriteOnly,
-                                format: format.to_wgpu(),
-                                view_dimension: wgpu::TextureViewDimension::D2,
-                            }
-                        },
-                        count: None,
-                    }
-                }
-            })
-            .collect::<Vec<_>>();
+        let entries = Self::create_bind_group_layout_entries(
+            layout.iter().map(|b| (*b, wgpu::ShaderStages::COMPUTE)),
+        );
         #[cfg(not(target_arch = "wasm32"))]
         if let Some(uninit) = self.shaders_to_initialise.as_mut() {
             let id = add(Shader {
@@ -342,6 +302,73 @@ impl WgpuEngine {
         })
     }
 
+    /// Builds a render pipeline from `module` and registers it as a shader with no CPU variant.
+    ///
+    /// Returns the [`ShaderId`] under which the pipeline and its bind group layout are stored.
+    #[allow(clippy::too_many_arguments)]
+    pub fn add_render_shader(
+        &mut self,
+        device: &Device,
+        label: &'static str,
+        module: &wgpu::ShaderModule,
+        vertex_main: &'static str,
+        fragment_main: &'static str,
+        topology: wgpu::PrimitiveTopology,
+        color_attachment: wgpu::ColorTargetState,
+        vertex_buffer: Option<wgpu::VertexBufferLayout>,
+        bind_layout: &[(BindType, wgpu::ShaderStages)],
+    ) -> ShaderId {
+        let group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
+            label: None,
+            entries: &Self::create_bind_group_layout_entries(bind_layout.iter().copied()),
+        });
+        let layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
+            label: None,
+            bind_group_layouts: &[&group_layout],
+            push_constant_ranges: &[],
+        });
+        let vertex = wgpu::VertexState {
+            module,
+            entry_point: vertex_main,
+            buffers: vertex_buffer
+                .as_ref()
+                .map(core::slice::from_ref)
+                .unwrap_or_default(),
+            compilation_options: PipelineCompilationOptions::default(),
+        };
+        let fragment = wgpu::FragmentState {
+            module,
+            entry_point: fragment_main,
+            targets: &[Some(color_attachment)],
+            compilation_options: PipelineCompilationOptions::default(),
+        };
+        let render_pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor {
+            label: Some(label),
+            layout: Some(&layout),
+            vertex,
+            fragment: Some(fragment),
+            // Unlisted primitive fields keep their wgpu defaults (CCW front face, fill mode).
+            primitive: wgpu::PrimitiveState {
+                topology,
+                cull_mode: Some(wgpu::Face::Back),
+                ..Default::default()
+            },
+            depth_stencil: None,
+            multisample: wgpu::MultisampleState::default(),
+            multiview: None,
+            cache: None,
+        });
+        self.shaders.push(Shader {
+            wgpu: Some(WgpuShader {
+                pipeline: PipelineState::Render(render_pipeline),
+                bind_group_layout: group_layout,
+            }),
+            cpu: None,
+            label,
+        });
+        ShaderId(self.shaders.len() - 1)
+    }
+
     pub fn run_recording(
         &mut self,
         device: &Device,
@@ -365,8 +392,11 @@ impl WgpuEngine {
                     transient_map
                         .bufs
                         .insert(buf_proxy.id, TransientBuf::Cpu(bytes));
-                    let usage =
-                        BufferUsages::COPY_SRC | BufferUsages::COPY_DST | BufferUsages::STORAGE;
+                    // TODO: restrict VERTEX usage to "debug_layers" feature?
+                    let usage = BufferUsages::COPY_SRC
+                        | BufferUsages::COPY_DST
+                        | BufferUsages::STORAGE
+                        | BufferUsages::VERTEX;
                     let buf = self
                         .pool
                         .get_buf(buf_proxy.size, buf_proxy.name, usage, device);
@@ -523,7 +553,10 @@ impl WgpuEngine {
                             let query = profiler
                                 .begin_query(shader.label, &mut cpass, device)
                                 .with_parent(Some(&query));
-                            cpass.set_pipeline(&wgpu_shader.pipeline);
+                            let PipelineState::Compute(pipeline) = &wgpu_shader.pipeline else {
+                                panic!("cannot issue a dispatch with a render pipeline");
+                            };
+                            cpass.set_pipeline(pipeline);
                             cpass.set_bind_group(0, &bind_group, &[]);
                             cpass.dispatch_workgroups(x, y, z);
                             #[cfg(feature = "wgpu-profiler")]
@@ -570,7 +603,10 @@ impl WgpuEngine {
                             let query = profiler
                                 .begin_query(shader.label, &mut cpass, device)
                                 .with_parent(Some(&query));
-                            cpass.set_pipeline(&wgpu_shader.pipeline);
+                            let PipelineState::Compute(pipeline) = &wgpu_shader.pipeline else {
+                                panic!("cannot issue a dispatch with a render pipeline");
+                            };
+                            cpass.set_pipeline(pipeline);
                             cpass.set_bind_group(0, &bind_group, &[]);
                             let buf = self.bind_map.get_gpu_buf(proxy.id).ok_or(
                                 Error::UnavailableBufferUsed(proxy.name, "indirect dispatch"),
@@ -581,6 +617,68 @@ impl WgpuEngine {
                         }
                     }
                 }
+                Command::Draw(draw_params) => {
+                    let shader = &self.shaders[draw_params.shader_id.0];
+                    #[cfg(feature = "wgpu-profiler")]
+                    let label = shader.label;
+                    let ShaderKind::Wgpu(shader) = shader.select() else {
+                        panic!("a render pass does not have a CPU equivalent");
+                    };
+                    let bind_group = transient_map.create_bind_group(
+                        &mut self.bind_map,
+                        &mut self.pool,
+                        device,
+                        queue,
+                        &mut encoder,
+                        &shader.bind_group_layout,
+                        &draw_params.resources,
+                    );
+                    let render_target = transient_map
+                        .materialize_external_image_for_render_pass(&draw_params.target);
+                    let mut rpass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
+                        label: None,
+                        color_attachments: &[Some(wgpu::RenderPassColorAttachment {
+                            view: render_target,
+                            resolve_target: None,
+                            ops: wgpu::Operations {
+                                load: match draw_params.clear_color {
+                                    Some(c) => wgpu::LoadOp::Clear(wgpu::Color {
+                                        r: c[0] as f64,
+                                        g: c[1] as f64,
+                                        b: c[2] as f64,
+                                        a: c[3] as f64,
+                                    }),
+                                    None => wgpu::LoadOp::Load,
+                                },
+                                store: wgpu::StoreOp::Store,
+                            },
+                        })],
+                        depth_stencil_attachment: None,
+                        occlusion_query_set: None,
+                        timestamp_writes: None,
+                    });
+                    #[cfg(feature = "wgpu-profiler")]
+                    let query = profiler
+                        .begin_query(label, &mut rpass, device)
+                        .with_parent(Some(&query));
+                    let PipelineState::Render(pipeline) = &shader.pipeline else {
+                        panic!("cannot issue a draw with a compute pipeline");
+                    };
+                    rpass.set_pipeline(pipeline);
+                    if let Some(proxy) = draw_params.vertex_buffer {
+                        // TODO: need a way to materialize a CPU initialized buffer. For now assume
+                        // buffer exists? Also, need to materialize this buffer with vertex usage
+                        let buf = self
+                            .bind_map
+                            .get_gpu_buf(proxy.id)
+                            .ok_or(Error::UnavailableBufferUsed(proxy.name, "draw"))?;
+                        rpass.set_vertex_buffer(0, buf.slice(..));
+                    }
+                    rpass.set_bind_group(0, &bind_group, &[]);
+                    rpass.draw(0..draw_params.vertex_count, 0..draw_params.instance_count);
+                    #[cfg(feature = "wgpu-profiler")]
+                    profiler.end_query(&mut rpass, query);
+                }
                 Command::Download(proxy) => {
                     let src_buf = self
                         .bind_map
@@ -617,6 +715,9 @@ impl WgpuEngine {
         }
         #[cfg(feature = "wgpu-profiler")]
         profiler.end_query(&mut encoder, query);
+        // TODO: This only actually needs to happen once per frame, but run_recording happens two or three times
+        #[cfg(feature = "wgpu-profiler")]
+        profiler.resolve_queries(&mut encoder);
         queue.submit(Some(encoder.finish()));
         for id in free_bufs {
             if let Some(buf) = self.bind_map.buf_map.remove(&id) {
@@ -649,6 +750,58 @@ impl WgpuEngine {
         self.downloads.remove(&buf.id);
     }
 
+    /// Converts `(BindType, visibility)` pairs into wgpu bind group layout entries.
+    ///
+    /// Binding indices are assigned sequentially in iteration order, starting at 0. Every entry
+    /// has `count: None`; buffer bindings use no dynamic offset and no minimum binding size.
+    ///
+    /// The binding type is chosen from the `BindType`:
+    /// - `Buffer` / `BufReadOnly`: storage buffer (read-only for `BufReadOnly`),
+    /// - `Uniform`: uniform buffer,
+    /// - `ImageRead`: filterable float 2D texture, not multisampled,
+    /// - `Image`: write-only 2D storage texture using the image's format.
+    fn create_bind_group_layout_entries(
+        layout: impl Iterator<Item = (BindType, wgpu::ShaderStages)>,
+    ) -> Vec<wgpu::BindGroupLayoutEntry> {
+        layout
+            .enumerate()
+            .map(|(index, (bind_type, visibility))| {
+                let ty = match bind_type {
+                    BindType::Buffer | BindType::BufReadOnly => wgpu::BindingType::Buffer {
+                        ty: wgpu::BufferBindingType::Storage {
+                            read_only: bind_type == BindType::BufReadOnly,
+                        },
+                        has_dynamic_offset: false,
+                        min_binding_size: None,
+                    },
+                    BindType::Uniform => wgpu::BindingType::Buffer {
+                        ty: wgpu::BufferBindingType::Uniform,
+                        has_dynamic_offset: false,
+                        min_binding_size: None,
+                    },
+                    // The format payload is not needed when the image is bound as a texture.
+                    BindType::ImageRead(_) => wgpu::BindingType::Texture {
+                        sample_type: wgpu::TextureSampleType::Float { filterable: true },
+                        view_dimension: wgpu::TextureViewDimension::D2,
+                        multisampled: false,
+                    },
+                    BindType::Image(format) => wgpu::BindingType::StorageTexture {
+                        access: wgpu::StorageTextureAccess::WriteOnly,
+                        format: format.to_wgpu(),
+                        view_dimension: wgpu::TextureViewDimension::D2,
+                    },
+                };
+                // Only `ty` varies per bind type; the remaining fields are shared by all entries.
+                wgpu::BindGroupLayoutEntry {
+                    binding: index as u32,
+                    visibility,
+                    ty,
+                    count: None,
+                }
+            })
+            .collect()
+    }
+
     fn create_compute_pipeline(
         device: &Device,
         label: &str,
@@ -682,7 +835,7 @@ impl WgpuEngine {
             cache: None,
         });
         WgpuShader {
-            pipeline,
+            pipeline: PipelineState::Compute(pipeline),
             bind_group_layout,
         }
     }
@@ -879,6 +1032,14 @@ impl<'a> TransientBindMap<'a> {
         }
     }
 
+    /// Returns the view stored in `self.images` under `proxy.id`; panics if there is none.
+    fn materialize_external_image_for_render_pass(&mut self, proxy: &ImageProxy) -> &TextureView {
+        // TODO: Maybe this should support instantiating a transient texture. Right now all render
+        // passes target a `SurfaceTexture`, so supporting external textures is sufficient.
+        let view = self.images.get(&proxy.id);
+        view.expect("texture not materialized")
+    }
+
     #[allow(clippy::too_many_arguments)]
     fn create_bind_group(
         &mut self,
@@ -892,17 +1053,23 @@ impl<'a> TransientBindMap<'a> {
     ) -> BindGroup {
         for proxy in bindings {
             match proxy {
-                ResourceProxy::Buffer(proxy) => {
+                ResourceProxy::Buffer(proxy)
+                | ResourceProxy::BufferRange {
+                    proxy,
+                    offset: _,
+                    size: _,
+                } => {
                     if self.bufs.contains_key(&proxy.id) {
                         continue;
                     }
                     match bind_map.buf_map.entry(proxy.id) {
                         Entry::Vacant(v) => {
-                            // TODO: only some buffers will need indirect, but does it hurt?
+                            // TODO: only some buffers will need indirect & vertex, but does it hurt?
                             let usage = BufferUsages::COPY_SRC
                                 | BufferUsages::COPY_DST
                                 | BufferUsages::STORAGE
-                                | BufferUsages::INDIRECT;
+                                | BufferUsages::INDIRECT
+                                | BufferUsages::VERTEX;
                             let buf = pool.get_buf(proxy.size, proxy.name, usage, device);
                             if bind_map.pending_clears.remove(&proxy.id) {
                                 encoder.clear_buffer(&buf, 0, None);
@@ -966,6 +1133,24 @@ impl<'a> TransientBindMap<'a> {
                         resource: buf.as_entire_binding(),
                     }
                 }
+                ResourceProxy::BufferRange {
+                    proxy,
+                    offset,
+                    size,
+                } => {
+                    let buf = match self.bufs.get(&proxy.id) {
+                        Some(TransientBuf::Gpu(b)) => b,
+                        _ => bind_map.get_gpu_buf(proxy.id).unwrap(),
+                    };
+                    wgpu::BindGroupEntry {
+                        binding: i as u32,
+                        resource: wgpu::BindingResource::Buffer(wgpu::BufferBinding {
+                            buffer: buf,
+                            offset: *offset,
+                            size: core::num::NonZeroU64::new(*size),
+                        }),
+                    }
+                }
                 ResourceProxy::Image(proxy) => {
                     let view = self
                         .images
@@ -995,10 +1180,15 @@ impl<'a> TransientBindMap<'a> {
         // First pass is mutable; create buffers as needed
         for resource in bindings {
             match resource {
-                ResourceProxy::Buffer(buf) => match self.bufs.get(&buf.id) {
+                ResourceProxy::Buffer(proxy)
+                | ResourceProxy::BufferRange {
+                    proxy,
+                    offset: _,
+                    size: _,
+                } => match self.bufs.get(&proxy.id) {
                     Some(TransientBuf::Cpu(_)) => (),
                     Some(TransientBuf::Gpu(_)) => panic!("buffer was already materialized on GPU"),
-                    _ => bind_map.materialize_cpu_buf(buf),
+                    _ => bind_map.materialize_cpu_buf(proxy),
                 },
                 ResourceProxy::Image(_) => todo!(),
             };
@@ -1011,6 +1201,7 @@ impl<'a> TransientBindMap<'a> {
                     Some(TransientBuf::Cpu(b)) => CpuBinding::Buffer(b),
                     _ => bind_map.get_cpu_buf(buf.id),
                 },
+                ResourceProxy::BufferRange { .. } => todo!(),
                 ResourceProxy::Image(_) => todo!(),
             })
             .collect()
diff --git a/vello_tests/src/lib.rs b/vello_tests/src/lib.rs
index e2b6c7898..e97910593 100644
--- a/vello_tests/src/lib.rs
+++ b/vello_tests/src/lib.rs
@@ -98,6 +98,7 @@ pub async fn get_scene_image(params: &TestParams, scene: &Scene) -> Result<Image
         width,
         height,
         antialiasing_method: vello::AaConfig::Area,
+        debug: vello::DebugLayers::none(),
     };
     let size = Extent3d {
         width,