diff --git a/video/out/opengl/ra.h b/video/out/opengl/ra.h index ae7fb9aea730a..1f716d98f8bdc 100644 --- a/video/out/opengl/ra.h +++ b/video/out/opengl/ra.h @@ -146,6 +146,7 @@ enum ra_buf_type { RA_BUF_TYPE_TEX_UPLOAD, // texture upload buffer (pixel buffer object) RA_BUF_TYPE_SHADER_STORAGE, // shader buffer (SSBO), for RA_VARTYPE_BUF_RW RA_BUF_TYPE_UNIFORM, // uniform buffer (UBO), for RA_VARTYPE_BUF_RO + RA_BUF_TYPE_VERTEX, // not publicly usable (RA-internal usage) }; struct ra_buf_params { @@ -369,10 +370,10 @@ struct ra_fns { void (*buf_destroy)(struct ra *ra, struct ra_buf *buf); - // Update the contents of a buffer, starting at a given offset and up to a - // given size, with the contents of *data. This is an extremely common - // operation. Calling this while the buffer is considered "in use" is an - // error. (See: buf_poll) + // Update the contents of a buffer, starting at a given offset (*must* be a + // multiple of 4) and up to a given size, with the contents of *data. This + // is an extremely common operation. Calling this while the buffer is + // considered "in use" is an error. 
(See: buf_poll) void (*buf_update)(struct ra *ra, struct ra_buf *buf, ptrdiff_t offset, const void *data, size_t size); diff --git a/video/out/opengl/utils.c b/video/out/opengl/utils.c index b8fc24a52e133..aeadd346b9408 100644 --- a/video/out/opengl/utils.c +++ b/video/out/opengl/utils.c @@ -64,7 +64,8 @@ static bool ra_buf_pool_grow(struct ra *ra, struct ra_buf_pool *pool) return false; MP_TARRAY_INSERT_AT(NULL, pool->buffers, pool->num_buffers, pool->index, buf); - MP_VERBOSE(ra, "Resized buffer pool to size %d\n", pool->num_buffers); + MP_VERBOSE(ra, "Resized buffer pool of type %u to size %d\n", + pool->current_params.type, pool->num_buffers); return true; } diff --git a/video/out/vo.c b/video/out/vo.c index f9c5d04e24be0..06507c7f87694 100644 --- a/video/out/vo.c +++ b/video/out/vo.c @@ -60,6 +60,7 @@ extern const struct vo_driver video_out_drm; extern const struct vo_driver video_out_direct3d; extern const struct vo_driver video_out_sdl; extern const struct vo_driver video_out_vaapi; +extern const struct vo_driver video_out_vulkan; extern const struct vo_driver video_out_wayland; extern const struct vo_driver video_out_rpi; extern const struct vo_driver video_out_tct; @@ -78,6 +79,9 @@ const struct vo_driver *const video_out_drivers[] = #if HAVE_DIRECT3D &video_out_direct3d, #endif +#if HAVE_VULKAN + &video_out_vulkan, +#endif #if HAVE_WAYLAND &video_out_wayland, #endif diff --git a/video/out/vo_vulkan.c b/video/out/vo_vulkan.c new file mode 100644 index 0000000000000..9e6c7984c6a3d --- /dev/null +++ b/video/out/vo_vulkan.c @@ -0,0 +1,335 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . + */ + +#include "mpv_talloc.h" +#include "options/m_config.h" +#include "osdep/timer.h" +#include "video/mp_image.h" +#include "video/out/x11_common.h" +#include "vo.h" +#include "sub/osd.h" + +#include "opengl/ra.h" +#include "opengl/video.h" + +#include "vulkan/common.h" +#include "vulkan/utils.h" +#include "vulkan/ra_vk.h" + +struct vo_vulkan_opts { + int debug; // whether to load the validation layers or not + int allow_sw; // whether to allow software devices + char *device; // force a specific GPU + int swsize; // swapchain size + int swdepth; // swapchain depth +}; + +struct vk_priv { + struct vo *vo; + struct mp_log *log; + + struct vo_vulkan_opts opts; + + struct mpvk_ctx vk; + struct ra *ra; + struct gl_video *renderer; + + struct vk_swchain swchain; + int frames_in_flight; +}; + +static bool resize(struct vo *vo) +{ + struct vk_priv *p = vo->priv; + + MP_VERBOSE(vo, "Resize: %dx%d\n", vo->dwidth, vo->dheight); + + if (!vk_swchain_resize(&p->swchain, vo->dwidth, vo->dheight)) { + MP_ERR(vo, "Failed resizing swapchain!\n"); + return false; + } + + struct mp_rect src, dst; + struct mp_osd_res osd; + vo_get_src_dst_rects(vo, &src, &dst, &osd); + + gl_video_resize(p->renderer, &src, &dst, &osd); + + vo->want_redraw = true; + return true; +} + +static int reconfig(struct vo *vo, struct mp_image_params *params) +{ + struct vk_priv *p = vo->priv; + + if (vo->x11) + vo_x11_config_vo_window(vo); + + if (!resize(vo)) + return VO_ERROR; + + gl_video_config(p->renderer, params); + + return 0; +} + +static void uninit(struct vo *vo) +{ + struct vk_priv *p = vo->priv; + struct mpvk_ctx *vk = &p->vk; + + 
gl_video_uninit(p->renderer); + + if (p->ra) { + vk_swchain_uninit(p->ra, &p->swchain); + p->ra->fns->destroy(p->ra); + } + + // Clean up platform-specific windowing stuff. Do this first to prevent + // keeping around the window for long, then we can uninit the device etc. + // afterwards + if (vo->x11) + vo_x11_uninit(vo); + + mpvk_uninit(vk); +} + +static int preinit(struct vo *vo) +{ + struct vk_priv *p = vo->priv; + struct mpvk_ctx *vk = &p->vk; + p->vo = vo; + p->log = vk->log = vo->log; + + if (!mpvk_instance_init(vk, p->opts.debug)) + goto error; + if (!mpvk_surface_init(vo, vk)) + goto error; + if (!mpvk_find_phys_device(vk, p->opts.device, p->opts.allow_sw)) + goto error; + if (!mpvk_pick_surface_format(vk)) + goto error; + if (!mpvk_device_init(vk)) + goto error; + p->ra = ra_create_vk(vk, p->log); + if (!p->ra) + goto error; + if (!vk_swchain_init(vk, p->ra, p->opts.swsize, &p->swchain)) + goto error; + + p->renderer = gl_video_init(p->ra, vo->log, vo->global); + gl_video_set_osd_source(p->renderer, vo->osd); + gl_video_configure_queue(p->renderer, vo); + + return 0; + +error: + uninit(vo); + return -1; +} + +static int control(struct vo *vo, uint32_t request, void *data) +{ + struct vk_priv *p = vo->priv; + + switch (request) { + case VOCTRL_SET_PANSCAN: + return resize(vo) ? 
VO_TRUE : VO_ERROR; + case VOCTRL_SET_EQUALIZER: + vo->want_redraw = true; + return VO_TRUE; + case VOCTRL_UPDATE_RENDER_OPTS: { + gl_video_update_options(p->renderer); + gl_video_configure_queue(p->renderer, p->vo); + p->vo->want_redraw = true; + return true; + } + case VOCTRL_RESET: + gl_video_reset(p->renderer); + return true; + case VOCTRL_PAUSE: + if (gl_video_showing_interpolated_frame(p->renderer)) + vo->want_redraw = true; + return true; + case VOCTRL_PERFORMANCE_DATA: + gl_video_perfdata(p->renderer, (struct voctrl_performance_data *)data); + return true; + } + + int events = 0, r = 0; + + if (vo->x11) + r |= vo_x11_control(vo, &events, request, data); + + if (events & VO_EVENT_RESIZE) + r |= resize(vo) ? 0 : VO_ERROR; + + if (events & VO_EVENT_EXPOSE) + vo->want_redraw = true; + + vo_event(vo, events); + return r; +} + +static void draw_frame(struct vo *vo, struct vo_frame *frame) +{ + struct vk_priv *p = vo->priv; + struct vk_swimg swimg; + if (!vk_swchain_get(&p->swchain, &swimg)) + goto error; + + struct fbodst target = { + .tex = swimg.image, + .flip = false, + }; + + gl_video_render_frame(p->renderer, frame, target); + if (!ra_vk_present_frame(p->ra, &swimg, &p->frames_in_flight)) { + MP_ERR(vo, "Failed presenting frame!\n"); + goto error; + } + +error: + return; +} + +static void flip_page(struct vo *vo) +{ + struct vk_priv *p = vo->priv; + while (p->frames_in_flight >= p->opts.swdepth) + mpvk_poll_cmds(&p->vk, p->vk.pool, UINT64_MAX); +} + +static int query_format(struct vo *vo, int format) +{ + struct vk_priv *p = vo->priv; + if (!gl_video_check_format(p->renderer, format)) + return 0; + return 1; +} + +static void wakeup(struct vo *vo) +{ + if (vo->x11) + vo_x11_wakeup(vo); +} + +static void wait_events(struct vo *vo, int64_t until_time_us) +{ + if (vo->x11) { + vo_x11_wait_events(vo, until_time_us); + } else { + vo_wait_default(vo, until_time_us); + } +} + +static struct mp_image *get_image(struct vo *vo, int imgfmt, int w, int h, + int 
stride_align) +{ + struct vk_priv *p = vo->priv; + return gl_video_get_image(p->renderer, imgfmt, w, h, stride_align); +} + +static int vk_validate_dev(struct mp_log *log, const struct m_option *opt, + struct bstr name, struct bstr param) +{ + int ret = M_OPT_INVALID; + VkResult res; + + // Create a dummy instance to validate/list the devices + VkInstanceCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, + }; + + VkInstance inst; + VkPhysicalDevice *devices = NULL; + uint32_t num = 0; + + res = vkCreateInstance(&info, MPVK_ALLOCATOR, &inst); + if (res != VK_SUCCESS) + goto error; + + res = vkEnumeratePhysicalDevices(inst, &num, NULL); + if (res != VK_SUCCESS) + goto error; + + devices = talloc_array(NULL, VkPhysicalDevice, num); + vkEnumeratePhysicalDevices(inst, &num, devices); + if (res != VK_SUCCESS) + goto error; + + bool help = bstr_equals0(param, "help"); + if (help) { + mp_info(log, "Available vulkan devices:\n"); + ret = M_OPT_EXIT; + } + + for (int i = 0; i < num; i++) { + VkPhysicalDeviceProperties prop; + vkGetPhysicalDeviceProperties(devices[i], &prop); + + if (help) { + mp_info(log, " '%s' (GPU %d, ID %x:%x)\n", prop.deviceName, i, + prop.vendorID, prop.deviceID); + } else if (bstr_equals0(param, prop.deviceName)) { + ret = 0; + break; + } + } + + if (!help) + mp_err(log, "No device with name '%.*s'!\n", BSTR_P(param)); + +error: + talloc_free(devices); + return ret; +} + +#define OPT_BASE_STRUCT struct vk_priv + +const struct vo_driver video_out_vulkan = { + .description = "Vulkan Renderer", + .name = "vulkan", + .preinit = preinit, + .query_format = query_format, + .reconfig = reconfig, + .control = control, + .get_image = get_image, + .draw_frame = draw_frame, + .flip_page = flip_page, + .wait_events = wait_events, + .wakeup = wakeup, + .uninit = uninit, + .priv_size = sizeof(struct vk_priv), + .options = (const m_option_t[]) { + OPT_FLAG("vulkan-debug", opts.debug, 0), + OPT_FLAG("vulkan-sw", opts.allow_sw, 0), + 
OPT_STRING_VALIDATE("vulkan-device", opts.device, 0, vk_validate_dev), + OPT_INTRANGE("vulkan-swapchain-size", opts.swsize, 0, 1, + MPVK_MAX_STREAMING_DEPTH), + OPT_INTRANGE("vulkan-swapchain-depth", opts.swdepth, 0, 1, + MPVK_MAX_STREAMING_DEPTH), + {0} + }, + .priv_defaults = &(const struct vk_priv) { + .opts = { + .swsize = 8, + .swdepth = 1, + }, + }, +}; diff --git a/video/out/vulkan/common.h b/video/out/vulkan/common.h new file mode 100644 index 0000000000000..9113d27a6a201 --- /dev/null +++ b/video/out/vulkan/common.h @@ -0,0 +1,50 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "config.h" + +#include "common/common.h" +#include "common/msg.h" + +// We need to define all platforms we want to support. Since we have +// our own mechanism for checking this, we re-define the right symbols +#if HAVE_VULKAN_XLIB +#define VK_USE_PLATFORM_XLIB_KHR +#endif + +#include + +// Vulkan allows the optional use of a custom allocator. We don't need one but +// mark this parameter with a better name in case we ever decide to change this +// in the future. (And to make the code more readable) +#define MPVK_ALLOCATOR NULL + +// A lot of things depend on streaming resources across frames. Depending on +// how many frames we render ahead of time, we need to pick enough to avoid +// any conflicts, so make all of these tunable relative to this constant in +// order to centralize them. 
+#define MPVK_MAX_STREAMING_DEPTH 8 + +// Shared struct used to hold vulkan context information +struct mpvk_ctx { + struct mp_log *log; + VkInstance inst; + VkPhysicalDevice physd; + VkDebugReportCallbackEXT dbg; + VkDevice dev; + + // Surface, must be initialized fter the context itself + VkSurfaceKHR surf; + VkSurfaceFormatKHR surf_format; // picked at surface initialization time + + struct vk_malloc *alloc; // memory allocator for this device + struct vk_cmdpool *pool; // command pool for this device + + // Cached capabilities + VkPhysicalDeviceLimits limits; +}; diff --git a/video/out/vulkan/formats.c b/video/out/vulkan/formats.c new file mode 100644 index 0000000000000..b44bead99cc80 --- /dev/null +++ b/video/out/vulkan/formats.c @@ -0,0 +1,55 @@ +#include "formats.h" + +const struct vk_format vk_formats[] = { + // Regular, byte-aligned integer formats + {"r8", VK_FORMAT_R8_UNORM, 1, 1, {8 }, RA_CTYPE_UNORM }, + {"rg8", VK_FORMAT_R8G8_UNORM, 2, 2, {8, 8 }, RA_CTYPE_UNORM }, + {"rgb8", VK_FORMAT_R8G8B8_UNORM, 3, 3, {8, 8, 8 }, RA_CTYPE_UNORM }, + {"rgba8", VK_FORMAT_R8G8B8A8_UNORM, 4, 4, {8, 8, 8, 8 }, RA_CTYPE_UNORM }, + {"r16", VK_FORMAT_R16_UNORM, 1, 2, {16 }, RA_CTYPE_UNORM }, + {"rg16", VK_FORMAT_R16G16_UNORM, 2, 4, {16, 16 }, RA_CTYPE_UNORM }, + {"rgb16", VK_FORMAT_R16G16B16_UNORM, 3, 6, {16, 16, 16 }, RA_CTYPE_UNORM }, + {"rgba16", VK_FORMAT_R16G16B16A16_UNORM, 4, 8, {16, 16, 16, 16}, RA_CTYPE_UNORM }, + + // Special, integer-only formats + {"r32ui", VK_FORMAT_R32_UINT, 1, 4, {32 }, RA_CTYPE_UINT }, + {"rg32ui", VK_FORMAT_R32G32_UINT, 2, 8, {32, 32 }, RA_CTYPE_UINT }, + {"rgb32ui", VK_FORMAT_R32G32B32_UINT, 3, 12, {32, 32, 32 }, RA_CTYPE_UINT }, + {"rgba32ui", VK_FORMAT_R32G32B32A32_UINT, 4, 16, {32, 32, 32, 32}, RA_CTYPE_UINT }, + {"r64ui", VK_FORMAT_R64_UINT, 1, 8, {64 }, RA_CTYPE_UINT }, + {"rg64ui", VK_FORMAT_R64G64_UINT, 2, 16, {64, 64 }, RA_CTYPE_UINT }, + {"rgb64ui", VK_FORMAT_R64G64B64_UINT, 3, 24, {64, 64, 64 }, RA_CTYPE_UINT }, + {"rgba64ui", 
VK_FORMAT_R64G64B64A64_UINT, 4, 32, {64, 64, 64, 64}, RA_CTYPE_UINT }, + + // Packed integer formats + {"rg4", VK_FORMAT_R4G4_UNORM_PACK8, 2, 1, {4, 4 }, RA_CTYPE_UNORM }, + {"rgba4", VK_FORMAT_R4G4B4A4_UNORM_PACK16, 4, 2, {4, 4, 4, 4 }, RA_CTYPE_UNORM }, + {"rgb565", VK_FORMAT_R5G6B5_UNORM_PACK16, 3, 2, {5, 6, 5 }, RA_CTYPE_UNORM }, + {"rgb565a1", VK_FORMAT_R5G5B5A1_UNORM_PACK16, 4, 2, {5, 5, 5, 1 }, RA_CTYPE_UNORM }, + + // Float formats (native formats, hf = half float, df = double float) + {"r16hf", VK_FORMAT_R16_SFLOAT, 1, 2, {16 }, RA_CTYPE_FLOAT }, + {"rg16hf", VK_FORMAT_R16G16_SFLOAT, 2, 4, {16, 16 }, RA_CTYPE_FLOAT }, + {"rgb16hf", VK_FORMAT_R16G16B16_SFLOAT, 3, 6, {16, 16, 16 }, RA_CTYPE_FLOAT }, + {"rgba16hf", VK_FORMAT_R16G16B16A16_SFLOAT, 4, 8, {16, 16, 16, 16}, RA_CTYPE_FLOAT }, + {"r32f", VK_FORMAT_R32_SFLOAT, 1, 4, {32 }, RA_CTYPE_FLOAT }, + {"rg32f", VK_FORMAT_R32G32_SFLOAT, 2, 8, {32, 32 }, RA_CTYPE_FLOAT }, + {"rgb32f", VK_FORMAT_R32G32B32_SFLOAT, 3, 12, {32, 32, 32 }, RA_CTYPE_FLOAT }, + {"rgba32f", VK_FORMAT_R32G32B32A32_SFLOAT, 4, 16, {32, 32, 32, 32}, RA_CTYPE_FLOAT }, + {"r64df", VK_FORMAT_R64_SFLOAT, 1, 8, {64 }, RA_CTYPE_FLOAT }, + {"rg64df", VK_FORMAT_R64G64_SFLOAT, 2, 16, {64, 64 }, RA_CTYPE_FLOAT }, + {"rgb64df", VK_FORMAT_R64G64B64_SFLOAT, 3, 24, {64, 64, 64 }, RA_CTYPE_FLOAT }, + {"rgba64df", VK_FORMAT_R64G64B64A64_SFLOAT, 4, 32, {64, 64, 64, 64}, RA_CTYPE_FLOAT }, + + // "Swapped" component order images + {"bgr8", VK_FORMAT_B8G8R8_UNORM, 3, 3, {8, 8, 8 }, RA_CTYPE_UNORM, true }, + {"bgra8", VK_FORMAT_B8G8R8A8_UNORM, 4, 4, {8, 8, 8, 8 }, RA_CTYPE_UNORM, true }, + {"bgra4", VK_FORMAT_B4G4R4A4_UNORM_PACK16, 4, 2, {4, 4, 4, 4 }, RA_CTYPE_UNORM, true }, + {"bgr565", VK_FORMAT_B5G6R5_UNORM_PACK16, 3, 2, {5, 6, 5 }, RA_CTYPE_UNORM, true }, + {"bgr565a1", VK_FORMAT_B5G5R5A1_UNORM_PACK16, 4, 2, {5, 5, 5, 1 }, RA_CTYPE_UNORM, true }, + {"a1rgb5", VK_FORMAT_A1R5G5B5_UNORM_PACK16, 4, 2, {1, 5, 5, 5 }, RA_CTYPE_UNORM, true }, + {"a2rgb10", 
VK_FORMAT_A2R10G10B10_UNORM_PACK32, 4, 4, {2, 10, 10, 10}, RA_CTYPE_UNORM, true }, + {"a2bgr10", VK_FORMAT_A2B10G10R10_UNORM_PACK32, 4, 4, {2, 10, 10, 10}, RA_CTYPE_UNORM, true }, + {"abgr8", VK_FORMAT_A8B8G8R8_UNORM_PACK32, 4, 4, {8, 8, 8, 8 }, RA_CTYPE_UNORM, true }, + {0} +}; diff --git a/video/out/vulkan/formats.h b/video/out/vulkan/formats.h new file mode 100644 index 0000000000000..e57275a153a12 --- /dev/null +++ b/video/out/vulkan/formats.h @@ -0,0 +1,16 @@ +#pragma once + +#include "video/out/opengl/ra.h" +#include "common.h" + +struct vk_format { + const char *name; + VkFormat iformat; // vulkan format enum + int components; // how many components are there + int bytes; // how many bytes is a texel + int bits[4]; // how many bits per component + enum ra_ctype ctype; // format representation type + bool fucked_order; // used for formats which are not simply rgba +}; + +extern const struct vk_format vk_formats[]; diff --git a/video/out/vulkan/malloc.c b/video/out/vulkan/malloc.c new file mode 100644 index 0000000000000..cdab6eb590e63 --- /dev/null +++ b/video/out/vulkan/malloc.c @@ -0,0 +1,315 @@ +#include "malloc.h" +#include "utils.h" +#include "osdep/timer.h" + +// Controls how much more space we will allocate than actually necessary. +// Increasing this number increases the amount of memory used in total, but +// decreases the frequency at which slabs need to be allocated and freed. A +// value of 4 means the slabs will be allocated 4 times as large as they need +// to be. +#define MPVK_HEAP_SLAB_OVERCOMMIT 4 + +// Controls the minimum slab size, to avoid overusing small slabs when +// allocating many small slabs. (Default: 1 MB) +#define MPVK_HEAP_MINIMUM_SLAB_SIZE (1 << 20) + +// A single slab represents a contiguous region of allocated memory. Actual +// allocations are served as slices of this. Slabs are organized into linked +// lists, which represent individual heaps. 
+struct vk_slab { + struct vk_slab *next; // pointer to next vk_slab, or NULL + VkDeviceMemory mem; // underlying device allocation + VkDeviceSize size; // total size of `slab` + VkDeviceSize used; // number of bytes actually in use (for GC accounting) + VkDeviceSize index; // next free byte in `slab` + // optional, depends on the memory type: + VkBuffer buffer; // buffer spanning the entire slab + void *data; // mapped memory corresponding to `mem` +}; + +struct vk_heap { + VkBufferUsageFlagBits usage; // or 0 for generic heaps + struct vk_slab *tip; // linked list of slabs that form this heap +}; + +// Represents a single memory type. All allocations of this memory type are +// grouped together into heaps; one per buffer usage type and one for generic +// allocations (e.g. images). +struct vk_memtype { + int index; // the memory type index + int heapIndex; // the memory heap index + VkMemoryPropertyFlagBits flags; // the memory type bits + struct vk_heap generic_heap; // the heap for generic allocations + // An array of heaps for each possible buffer type (grows dynamically): + // This is an array of sub-allocations, so we can resize the buf_heaps + // array without breaking the vk_heap pointers in memslice.priv. + struct vk_heap **buf_heaps; + int num_buf_heaps; +}; + +// The overall state of the allocator, which keeps track of a vk_heap for each +// memory type supported by the device. 
+struct vk_malloc { + struct vk_memtype types[VK_MAX_MEMORY_TYPES]; + int num_types; +}; + +void vk_malloc_init(struct mpvk_ctx *vk) +{ + assert(vk->physd); + + struct vk_malloc *ma = vk->alloc = talloc_zero(NULL, struct vk_malloc); + + VkPhysicalDeviceMemoryProperties prop; + vkGetPhysicalDeviceMemoryProperties(vk->physd, &prop); + + ma->num_types = prop.memoryTypeCount; + for (int i = 0; i < prop.memoryTypeCount; i++) { + ma->types[i] = (struct vk_memtype) { + .index = i, + .heapIndex = prop.memoryTypes[i].heapIndex, + .flags = prop.memoryTypes[i].propertyFlags, + }; + } +} + +// "Unlinks" a slab. The slab_ptr is updated to the next link in the chain, +// or NULL if none left. +static void slab_free(struct mpvk_ctx *vk, struct vk_slab **slab_ptr) +{ + struct vk_slab *slab = *slab_ptr; + if (!slab) + return; + + assert(slab->used == 0); + + int64_t start = mp_time_us(); + vkDestroyBuffer(vk->dev, slab->buffer, MPVK_ALLOCATOR); + // also implicitly unmaps the memory if needed + vkFreeMemory(vk->dev, slab->mem, MPVK_ALLOCATOR); + int64_t stop = mp_time_us(); + + MP_VERBOSE(vk, "Freeing slab of size %lu took %ld μs.\n", + slab->size, stop - start); + + *slab_ptr = slab->next; + talloc_free(slab); +} + +static void heap_uninit(struct mpvk_ctx *vk, struct vk_heap *heap) +{ + while (heap->tip) + slab_free(vk, &heap->tip); +} + +void vk_malloc_uninit(struct mpvk_ctx *vk) +{ + struct vk_malloc *ma = vk->alloc; + if (!ma) + return; + + for (int i = 0; i < ma->num_types; i++) { + heap_uninit(vk, &ma->types[i].generic_heap); + for (int b = 0; b < ma->types[i].num_buf_heaps; b++) { + heap_uninit(vk, ma->types[i].buf_heaps[b]); + talloc_free(ma->types[i].buf_heaps[b]); + } + talloc_free(ma->types[i].buf_heaps); + } + + talloc_free(vk->alloc); +} + +// reqs: optional +static struct vk_memtype *find_best_memtype(struct mpvk_ctx *vk, + VkMemoryPropertyFlagBits flags, + VkMemoryRequirements *reqs) +{ + struct vk_malloc *ma = vk->alloc; + + // The vulkan spec requires memory types 
to be sorted in the "optimal" + // order, so the first matching type we find will be the best/fastest one. + for (int i = 0; i < ma->num_types; i++) { + // The memory type flags must include our properties + if ((ma->types[i].flags & flags) != flags) + continue; + // The memory type must be supported by the requirements (bitfield) + if (reqs && !(reqs->memoryTypeBits & (1 << i))) + continue; + + return &ma->types[i]; + } + + MP_ERR(vk, "Found no memory type matching property flags 0x%x!\n", flags); + return NULL; +} + +// Resizes a heap to make sure we have enough free bytes to serve an allocation +static bool resize_heap(struct mpvk_ctx *vk, struct vk_memtype *type, + struct vk_heap *heap, VkDeviceSize size, + VkDeviceSize align) +{ + // If the tip already exists and is large enough, we can return right away + if (heap->tip) { + if (MP_ALIGN_UP(heap->tip->index, align) + size <= heap->tip->size) + return true; + + // If the tip exists but is not large enough and has no other current + // allocations, free it right away to avoid accumulating garbage. + if (heap->tip->used == 0) + slab_free(vk, &heap->tip); + } + + // Otherwise, allocate a new vk_slab and prepend it to the linked list + struct vk_slab *slab = talloc_ptrtype(NULL, slab); + + VkDeviceSize minSize = MPMAX(MPVK_HEAP_MINIMUM_SLAB_SIZE, + MPVK_HEAP_SLAB_OVERCOMMIT * size); + *slab = (struct vk_slab) { + .next = heap->tip, + .size = heap->tip ? 
MPMAX(heap->tip->size, minSize) : minSize, + }; + + MP_VERBOSE(vk, "Allocating %lu memory of type 0x%x (id %d) in heap %d.\n", + slab->size, type->flags, type->index, type->heapIndex); + + VkMemoryAllocateInfo minfo = { + .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, + .memoryTypeIndex = type->index, + .allocationSize = slab->size, + }; + + if (heap->usage) { + VkBufferCreateInfo binfo = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .size = slab->size, + .usage = heap->usage, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + }; + + VK(vkCreateBuffer(vk->dev, &binfo, MPVK_ALLOCATOR, &slab->buffer)); + + VkMemoryRequirements reqs; + vkGetBufferMemoryRequirements(vk->dev, slab->buffer, &reqs); + minfo.allocationSize = reqs.size; // this can be larger than slab->size + + // Sanity check the memory requirements to make sure we didn't screw up + if (!(reqs.memoryTypeBits & (1 << type->index))) { + MP_ERR(vk, "Chosen memory type %d does not support buffer usage " + "0x%x!\n", type->index, heap->usage); + goto error; + } + } + + VK(vkAllocateMemory(vk->dev, &minfo, MPVK_ALLOCATOR, &slab->mem)); + + if (type->flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) + VK(vkMapMemory(vk->dev, slab->mem, 0, VK_WHOLE_SIZE, 0, &slab->data)); + + if (heap->usage) + VK(vkBindBufferMemory(vk->dev, slab->buffer, slab->mem, 0)); + + heap->tip = slab; + return true; + +error: + slab_free(vk, &slab); + return false; +} + +void vk_free_memslice(struct mpvk_ctx *vk, struct vk_memslice slice) +{ + struct vk_heap *heap = slice.priv; + + // Find the slab containing this allocation, while also keeping track + // of the pointer to it (so we can unlink it from the list if needed) + struct vk_slab **slab_ptr = &heap->tip; + struct vk_slab *slab = *slab_ptr; + while (slab) { + if (slab->mem == slice.vkmem) + break; + slab_ptr = &slab->next; + slab = *slab_ptr; + } + + assert(slab); + assert(slab->used >= slice.size); + slab->used -= slice.size; + + MP_DBG(vk, "Freeing slice %lu + %lu from slab with 
size %lu\n", + slice.offset, slice.size, slab->size); + + if (slab->used == 0 && slab != heap->tip) + slab_free(vk, slab_ptr); +} + +static bool slice_heap(struct mpvk_ctx *vk, struct vk_memtype *type, + struct vk_heap *heap, VkDeviceSize size, + VkDeviceSize alignment, struct vk_memslice *out) +{ + if (!resize_heap(vk, type, heap, size, alignment)) + return false; + + struct vk_slab *tip = heap->tip; + assert(tip); + *out = (struct vk_memslice) { + .vkmem = tip->mem, + .offset = MP_ALIGN_UP(tip->index, alignment), + .size = size, + .priv = heap, + }; + + MP_DBG(vk, "Sub-allocating slice %lu + %lu from slab with size %lu\n", + out->offset, out->size, tip->size); + + tip->index = out->offset + size; + tip->used += size; + return true; +} + +bool vk_malloc_generic(struct mpvk_ctx *vk, VkMemoryRequirements reqs, + VkMemoryPropertyFlagBits flags, struct vk_memslice *out) +{ + struct vk_memtype *type = find_best_memtype(vk, flags, &reqs); + if (!type) + return false; + + struct vk_heap *heap = &type->generic_heap; + return slice_heap(vk, type, heap, reqs.size, reqs.alignment, out); +} + +bool vk_malloc_buffer(struct mpvk_ctx *vk, VkBufferUsageFlagBits bufFlags, + VkMemoryPropertyFlagBits memFlags, VkDeviceSize size, + VkDeviceSize alignment, struct vk_bufslice *out) +{ + struct vk_memtype *type = find_best_memtype(vk, memFlags, NULL); + if (!type) + return false; + + struct vk_heap *heap = NULL; + for (int i = 0; i < type->num_buf_heaps; i++) { + if (type->buf_heaps[i]->usage == bufFlags) { + heap = type->buf_heaps[i]; + goto found; + } + } + + // no buffer heap with this type => add it + MP_TARRAY_GROW(NULL, type->buf_heaps, type->num_buf_heaps + 1); + heap = type->buf_heaps[type->num_buf_heaps++] = talloc_ptrtype(NULL, heap); + + *heap = (struct vk_heap) { + .usage = bufFlags, + }; + +found: + if (!slice_heap(vk, type, heap, size, alignment, &out->mem)) + return false; + + struct vk_slab *tip = heap->tip; + out->buf = tip->buffer; + if (tip->data) + out->data = (void 
*)((uintptr_t)tip->data + (ptrdiff_t)out->mem.offset); + + return true; +} diff --git a/video/out/vulkan/malloc.h b/video/out/vulkan/malloc.h new file mode 100644 index 0000000000000..1963950d54f3a --- /dev/null +++ b/video/out/vulkan/malloc.h @@ -0,0 +1,35 @@ +#pragma once + +#include "common.h" + +void vk_malloc_init(struct mpvk_ctx *vk); +void vk_malloc_uninit(struct mpvk_ctx *vk); + +// Represents a single "slice" of generic (non-buffer) memory, plus some +// metadata for accounting. This struct is essentially read-only. +struct vk_memslice { + VkDeviceMemory vkmem; + VkDeviceSize offset; + VkDeviceSize size; + void *priv; +}; + +void vk_free_memslice(struct mpvk_ctx *vk, struct vk_memslice slice); +bool vk_malloc_generic(struct mpvk_ctx *vk, VkMemoryRequirements reqs, + VkMemoryPropertyFlagBits flags, struct vk_memslice *out); + +// Represents a single "slice" of a larger buffer +struct vk_bufslice { + struct vk_memslice mem; // must be freed by the user when done + VkBuffer buf; // the buffer this memory was sliced from + // For persistently mapped buffers, this points to the first usable byte of + // this slice. + void *data; +}; + +// Allocate a buffer slice. This is more efficient than vk_malloc_generic for +// when the user needs lots of buffers, since it doesn't require +// creating/destroying lots of (little) VkBuffers. 
+bool vk_malloc_buffer(struct mpvk_ctx *vk, VkBufferUsageFlagBits bufFlags, + VkMemoryPropertyFlagBits memFlags, VkDeviceSize size, + VkDeviceSize alignment, struct vk_bufslice *out); diff --git a/video/out/vulkan/ra_vk.c b/video/out/vulkan/ra_vk.c new file mode 100644 index 0000000000000..853d868a861d6 --- /dev/null +++ b/video/out/vulkan/ra_vk.c @@ -0,0 +1,1588 @@ +#include "ra_vk.h" +#include "malloc.h" +#include "video/out/opengl/utils.h" + +// For ra.priv +struct ra_vk { + struct mpvk_ctx *vk; + struct ra_tex *clear_tex; // stupid hack for clear() + // "Currently recording" command buffer + struct vk_cmd *active_cmd; +}; + +static struct mpvk_ctx *vk_get(struct ra *ra) +{ + struct ra_vk *p = ra->priv; + return p->vk; +} + +static struct vk_cmd *vk_require_cmd(struct ra *ra) +{ + struct ra_vk *p = ra->priv; + struct mpvk_ctx *vk = vk_get(ra); + struct vk_cmdpool *pool = vk->pool; + + if (p->active_cmd) { + assert(p->active_cmd->pool == pool); + return p->active_cmd; + } + + struct vk_cmd *cmd = vk_cmd_begin(vk, pool); + return p->active_cmd = cmd; +} + +// Note: This technically follows the flush() API, but we don't need +// to expose that (and in fact, it's a bad idea) since we control flushing +// behavior with ra_vk_present_frame already. 
+static void vk_flush(struct ra *ra) +{ + struct ra_vk *p = ra->priv; + struct mpvk_ctx *vk = vk_get(ra); + + if (!p->active_cmd) + return; + + vk_cmd_submit(vk, p->active_cmd, NULL); + p->active_cmd = NULL; +} + +// the callback's *priv will always be set to `ra` +static void vk_callback(struct ra *ra, vk_cb callback, void *arg) +{ + struct ra_vk *p = ra->priv; + struct mpvk_ctx *vk = vk_get(ra); + + if (p->active_cmd) { + vk_cmd_callback(p->active_cmd, callback, ra, arg); + } else { + vk_dev_callback(vk, callback, ra, arg); + } +} + +#define MAKE_LAZY_DESTRUCTOR(fun, argtype) \ + static void fun##_lazy(struct ra *ra, argtype *arg) { \ + vk_callback(ra, (vk_cb) fun, arg); \ + } + +static void vk_destroy_ra(struct ra *ra) +{ + struct ra_vk *p = ra->priv; + struct mpvk_ctx *vk = vk_get(ra); + + vk_flush(ra); + mpvk_wait_idle(vk); + ra_tex_free(ra, &p->clear_tex); + + talloc_free(ra); +} + +static bool vk_setup_formats(struct ra *ra) +{ + struct mpvk_ctx *vk = vk_get(ra); + + for (const struct vk_format *vk_fmt = vk_formats; vk_fmt->name; vk_fmt++) { + VkFormatProperties prop; + vkGetPhysicalDeviceFormatProperties(vk->physd, vk_fmt->iformat, &prop); + + // As a bare minimum, we need to sample from an allocated image + VkFormatFeatureFlags flags = prop.optimalTilingFeatures; + if (!(flags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT)) + continue; + + VkFormatFeatureFlags linear_bits, render_bits; + linear_bits = VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT; + render_bits = VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT | + VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT; + + struct ra_format *fmt = talloc_zero(ra, struct ra_format); + *fmt = (struct ra_format) { + .name = vk_fmt->name, + .priv = (void *)vk_fmt, + .ctype = vk_fmt->ctype, + .ordered = !vk_fmt->fucked_order, + .num_components = vk_fmt->components, + .pixel_size = vk_fmt->bytes, + .linear_filter = !!(flags & linear_bits), + .renderable = !!(flags & render_bits), + }; + + for (int i = 0; i < 4; i++) + 
        // Per-component sizes/depths come straight from the vulkan format table
        fmt->component_size[i] = fmt->component_depth[i] = vk_fmt->bits[i];

        MP_TARRAY_APPEND(ra, ra->formats, ra->num_formats, fmt);
    }

    // Populate some other capabilities related to formats while we're at it
    VkImageType imgType[3] = {
        VK_IMAGE_TYPE_1D,
        VK_IMAGE_TYPE_2D,
        VK_IMAGE_TYPE_3D
    };

    // R8_UNORM is supported on literally every single vulkan implementation
    const VkFormat testfmt = VK_FORMAT_R8_UNORM;

    for (int d = 0; d < 3; d++) {
        VkImageFormatProperties iprop;
        VkResult res = vkGetPhysicalDeviceImageFormatProperties(vk->physd,
                testfmt, imgType[d], VK_IMAGE_TILING_OPTIMAL,
                VK_IMAGE_USAGE_SAMPLED_BIT, 0, &iprop);

        switch (imgType[d]) {
        case VK_IMAGE_TYPE_1D:
            if (res == VK_SUCCESS)
                ra->caps |= RA_CAP_TEX_1D;
            break;
        case VK_IMAGE_TYPE_2D:
            // 2D formats must be supported by RA, so ensure this is the case
            VK_ASSERT(res, "Querying 2D format limits");
            ra->max_texture_wh = MPMIN(iprop.maxExtent.width, iprop.maxExtent.height);
            break;
        case VK_IMAGE_TYPE_3D:
            if (res == VK_SUCCESS)
                ra->caps |= RA_CAP_TEX_3D;
            break;
        }
    }

    // RA_CAP_BLIT implies both blitting between images as well as blitting
    // directly to the swapchain image, so check for all three operations
    bool blittable = true;
    VkFormatProperties prop;
    vkGetPhysicalDeviceFormatProperties(vk->physd, testfmt, &prop);
    if (!(prop.optimalTilingFeatures & VK_FORMAT_FEATURE_BLIT_SRC_BIT))
        blittable = false;
    if (!(prop.optimalTilingFeatures & VK_FORMAT_FEATURE_BLIT_DST_BIT))
        blittable = false;

    // The swapchain surface format must also be a valid blit destination
    vkGetPhysicalDeviceFormatProperties(vk->physd, vk->surf_format.format, &prop);
    if (!(prop.optimalTilingFeatures & VK_FORMAT_FEATURE_BLIT_DST_BIT))
        blittable = false;

    if (blittable)
        ra->caps |= RA_CAP_BLIT;

    return true;

error:
    return false;
}

static struct ra_fns ra_fns_vk;

// Create a ra instance backed by the given (already initialized) vulkan
// context. Returns NULL on failure. Requires vk->dev and vk->alloc.
struct ra *ra_create_vk(struct mpvk_ctx *vk, struct mp_log *log)
{
    assert(vk->dev);
    assert(vk->alloc);

    struct ra *ra = talloc_zero(NULL, struct ra);
    ra->log = log;
    ra->fns = &ra_fns_vk;

    struct ra_vk *p = ra->priv = talloc_zero(ra, struct ra_vk);
    p->vk = vk;

    // There's no way to query the supported GLSL version from VK_NV_glsl_shader
    // (thanks nvidia), so just pick the GL version that modern nvidia devices
    // support..
    ra->glsl_version = 450;
    ra->glsl_vulkan = true;
    ra->max_shmem = vk->limits.maxComputeSharedMemorySize;
    ra->caps = RA_CAP_NESTED_ARRAY;

    if (vk->pool->props.queueFlags & VK_QUEUE_COMPUTE_BIT)
        ra->caps |= RA_CAP_COMPUTE;

    if (!vk_setup_formats(ra))
        goto error;

    // UBO support is required
    ra->caps |= RA_CAP_BUF_RO;

    // Try creating a shader storage buffer; SSBO support is optional, so
    // probe for it by attempting a small dummy allocation
    struct ra_buf_params ssbo_params = {
        .type = RA_BUF_TYPE_SHADER_STORAGE,
        .size = 16,
    };

    struct ra_buf *ssbo = ra_buf_create(ra, &ssbo_params);
    if (ssbo) {
        ra->caps |= RA_CAP_BUF_RW;
        ra_buf_free(ra, &ssbo);
    }

    // To support clear() by region, we need to allocate a dummy 1x1 image that
    // will be used as the source of blit operations
    struct ra_tex_params clear_params = {
        .dimensions = 1, // no point in using a 2D image if height = 1
        .w = 1,
        .h = 1,
        .d = 1,
        .format = ra_find_float16_format(ra, 4),
        .blit_src = 1,
        .host_mutable = 1,
    };

    p->clear_tex = ra_tex_create(ra, &clear_params);
    if (!p->clear_tex) {
        MP_ERR(ra, "Failed creating 1x1 dummy texture for clear()!\n");
        goto error;
    }

    return ra;

error:
    vk_destroy_ra(ra);
    return NULL;
}

// Boilerplate wrapper around vkCreateRenderPass to ensure passes remain
// compatible
static VkResult vk_create_render_pass(VkDevice dev, const struct ra_format *fmt,
                                      bool load_fbo, VkRenderPass *out)
{
    struct vk_format *vk_fmt = fmt->priv;
    assert(fmt->renderable);

    VkRenderPassCreateInfo rinfo = {
        .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO,
        .attachmentCount = 1,
        .pAttachments = &(VkAttachmentDescription) {
            .format = vk_fmt->iformat,
            .samples = VK_SAMPLE_COUNT_1_BIT,
            // LOAD preserves the existing FBO contents (needed e.g. when
            // blending into it), DONT_CARE lets the driver discard them
            .loadOp = load_fbo ? VK_ATTACHMENT_LOAD_OP_LOAD
                               : VK_ATTACHMENT_LOAD_OP_DONT_CARE,
            .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
            .initialLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
            .finalLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
        },
        .subpassCount = 1,
        .pSubpasses = &(VkSubpassDescription) {
            .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
            .colorAttachmentCount = 1,
            .pColorAttachments = &(VkAttachmentReference) {
                .attachment = 0,
                .layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
            },
        },
    };

    return vkCreateRenderPass(dev, &rinfo, MPVK_ALLOCATOR, out);
}

// For ra_tex.priv
struct ra_tex_vk {
    bool external_img; // VkImage is owned externally (e.g. by the swapchain)
    VkImageType type;
    VkImage img;
    struct vk_memslice mem;
    // for sampling
    VkImageView view;
    VkSampler sampler;
    // for rendering
    VkFramebuffer framebuffer;
    VkRenderPass dummyPass;
    // for uploading
    struct ra_buf_pool pbo;
    // "current" metadata, can change during the course of execution
    VkImageLayout current_layout;
    VkPipelineStageFlagBits current_stage;
    VkAccessFlagBits current_access;
};

// Small helper to ease image barrier creation.
if `discard` is set, the contents +// of the image will be undefined after the barrier +static void tex_barrier(struct vk_cmd *cmd, struct ra_tex_vk *tex_vk, + VkPipelineStageFlagBits newStage, + VkAccessFlagBits newAccess, VkImageLayout newLayout, + bool discard) +{ + VkImageMemoryBarrier imgBarrier = { + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .oldLayout = tex_vk->current_layout, + .newLayout = newLayout, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .srcAccessMask = tex_vk->current_access, + .dstAccessMask = newAccess, + .image = tex_vk->img, + .subresourceRange = vk_range, + }; + + if (discard) { + imgBarrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; + imgBarrier.srcAccessMask = 0; + } + + vkCmdPipelineBarrier(cmd->buf, tex_vk->current_stage, newStage, 0, + 0, NULL, 0, NULL, 1, &imgBarrier); + + tex_vk->current_stage = newStage; + tex_vk->current_layout = newLayout; + tex_vk->current_access = newAccess; +} + +static void vk_tex_destroy(struct ra *ra, struct ra_tex *tex) +{ + if (!tex) + return; + + struct mpvk_ctx *vk = vk_get(ra); + struct ra_tex_vk *tex_vk = tex->priv; + + ra_buf_pool_uninit(ra, &tex_vk->pbo); + vkDestroyFramebuffer(vk->dev, tex_vk->framebuffer, MPVK_ALLOCATOR); + vkDestroyRenderPass(vk->dev, tex_vk->dummyPass, MPVK_ALLOCATOR); + vkDestroySampler(vk->dev, tex_vk->sampler, MPVK_ALLOCATOR); + vkDestroyImageView(vk->dev, tex_vk->view, MPVK_ALLOCATOR); + if (!tex_vk->external_img) { + vkDestroyImage(vk->dev, tex_vk->img, MPVK_ALLOCATOR); + vk_free_memslice(vk, tex_vk->mem); + } + + talloc_free(tex); +} + +MAKE_LAZY_DESTRUCTOR(vk_tex_destroy, struct ra_tex); + +// Initializes non-VkImage values like the image view, samplers, etc. 
// Initializes the derived per-texture state (image view, sampler,
// framebuffer) for an already-created tex_vk->img, based on tex->params.
// Returns false on any vulkan error.
static bool vk_init_image(struct ra *ra, struct ra_tex *tex)
{
    struct mpvk_ctx *vk = vk_get(ra);

    struct ra_tex_params *params = &tex->params;
    struct ra_tex_vk *tex_vk = tex->priv;
    assert(tex_vk->img);

    // Fresh images start out in the UNDEFINED layout with no pending accesses
    tex_vk->current_layout = VK_IMAGE_LAYOUT_UNDEFINED;
    tex_vk->current_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
    tex_vk->current_access = 0;

    if (params->render_src || params->render_dst) {
        static const VkImageViewType viewType[] = {
            [VK_IMAGE_TYPE_1D] = VK_IMAGE_VIEW_TYPE_1D,
            [VK_IMAGE_TYPE_2D] = VK_IMAGE_VIEW_TYPE_2D,
            [VK_IMAGE_TYPE_3D] = VK_IMAGE_VIEW_TYPE_3D,
        };

        const struct vk_format *fmt = params->format->priv;
        VkImageViewCreateInfo vinfo = {
            .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
            .image = tex_vk->img,
            .viewType = viewType[tex_vk->type],
            .format = fmt->iformat,
            .subresourceRange = vk_range,
        };

        VK(vkCreateImageView(vk->dev, &vinfo, MPVK_ALLOCATOR, &tex_vk->view));
    }

    if (params->render_src) {
        // Requesting linear filtering on a format without linear_filter
        // support would be invalid usage
        assert(params->format->linear_filter || !params->src_linear);
        VkFilter filter = params->src_linear
            ? VK_FILTER_LINEAR
            : VK_FILTER_NEAREST;
        VkSamplerAddressMode wrap = params->src_repeat
            ? VK_SAMPLER_ADDRESS_MODE_REPEAT
            : VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
        VkSamplerCreateInfo sinfo = {
            .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO,
            .magFilter = filter,
            .minFilter = filter,
            .addressModeU = wrap,
            .addressModeV = wrap,
            .addressModeW = wrap,
            .maxAnisotropy = 1.0,
        };

        VK(vkCreateSampler(vk->dev, &sinfo, MPVK_ALLOCATOR, &tex_vk->sampler));
    }

    if (params->render_dst) {
        // Framebuffers need to be created against a specific render pass
        // layout, so we need to temporarily create a skeleton/dummy render
        // pass for vulkan to figure out the compatibility
        VK(vk_create_render_pass(vk->dev, params->format, false, &tex_vk->dummyPass));

        VkFramebufferCreateInfo finfo = {
            .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
            .renderPass = tex_vk->dummyPass,
            .attachmentCount = 1,
            .pAttachments = &tex_vk->view,
            .width = tex->params.w,
            .height = tex->params.h,
            .layers = 1,
        };

        VK(vkCreateFramebuffer(vk->dev, &finfo, MPVK_ALLOCATOR,
                               &tex_vk->framebuffer));

        // NOTE: Normally we would free the dummyPass again here, but a bug
        // in the nvidia vulkan driver causes a segfault if you do.
+ } + + return true; + +error: + return false; +} + +static struct ra_tex *vk_tex_create(struct ra *ra, + const struct ra_tex_params *params) +{ + struct mpvk_ctx *vk = vk_get(ra); + + struct ra_tex *tex = talloc_zero(NULL, struct ra_tex); + tex->params = *params; + tex->params.initial_data = NULL; + + struct ra_tex_vk *tex_vk = tex->priv = talloc_zero(tex, struct ra_tex_vk); + + const struct vk_format *fmt = params->format->priv; + switch (params->dimensions) { + case 1: tex_vk->type = VK_IMAGE_TYPE_1D; break; + case 2: tex_vk->type = VK_IMAGE_TYPE_2D; break; + case 3: tex_vk->type = VK_IMAGE_TYPE_3D; break; + default: abort(); + } + + VkImageUsageFlags usage = 0; + if (params->render_src) + usage |= VK_IMAGE_USAGE_SAMPLED_BIT; + if (params->render_dst) + usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; + if (params->storage_dst) + usage |= VK_IMAGE_USAGE_STORAGE_BIT; + if (params->blit_src) + usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT; + if (params->host_mutable || params->blit_dst || params->initial_data) + usage |= VK_IMAGE_USAGE_TRANSFER_DST_BIT; + + // Double-check image usage support and fail immediately if invalid + VkImageFormatProperties iprop; + VkResult res = vkGetPhysicalDeviceImageFormatProperties(vk->physd, + fmt->iformat, tex_vk->type, VK_IMAGE_TILING_OPTIMAL, usage, 0, + &iprop); + if (res == VK_ERROR_FORMAT_NOT_SUPPORTED) { + return NULL; + } else { + VK_ASSERT(res, "Querying image format properties"); + } + + VkFormatProperties prop; + vkGetPhysicalDeviceFormatProperties(vk->physd, fmt->iformat, &prop); + VkFormatFeatureFlags flags = prop.optimalTilingFeatures; + + bool has_blit_src = flags & VK_FORMAT_FEATURE_BLIT_SRC_BIT, + has_src_linear = flags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT; + + if (params->w > iprop.maxExtent.width || + params->h > iprop.maxExtent.height || + params->d > iprop.maxExtent.depth || + (params->blit_src && !has_blit_src) || + (params->src_linear && !has_src_linear)) + { + return NULL; + } + + VkImageCreateInfo 
iinfo = { + .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, + .imageType = tex_vk->type, + .format = fmt->iformat, + .extent = (VkExtent3D) { params->w, params->h, params->d }, + .mipLevels = 1, + .arrayLayers = 1, + .samples = VK_SAMPLE_COUNT_1_BIT, + .tiling = VK_IMAGE_TILING_OPTIMAL, + .usage = usage, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, + }; + + VK(vkCreateImage(vk->dev, &iinfo, MPVK_ALLOCATOR, &tex_vk->img)); + + VkMemoryPropertyFlagBits memFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + VkMemoryRequirements reqs; + vkGetImageMemoryRequirements(vk->dev, tex_vk->img, &reqs); + + struct vk_memslice *mem = &tex_vk->mem; + if (!vk_malloc_generic(vk, reqs, memFlags, mem)) + goto error; + + VK(vkBindImageMemory(vk->dev, tex_vk->img, mem->vkmem, mem->offset)); + + if (!vk_init_image(ra, tex)) + goto error; + + if (params->initial_data) { + struct ra_tex_upload_params ul_params = { + .tex = tex, + .invalidate = true, + .src = params->initial_data, + .stride = params->w * fmt->bytes, + }; + if (!ra->fns->tex_upload(ra, &ul_params)) + goto error; + } + + return tex; + +error: + vk_tex_destroy(ra, tex); + return NULL; +} + +struct ra_tex *ra_vk_wrap_swchain_img(struct ra *ra, VkImage vkimg, + VkSwapchainCreateInfoKHR info) +{ + struct mpvk_ctx *vk = vk_get(ra); + struct ra_tex *tex = NULL; + + const struct ra_format *format = NULL; + for (int i = 0; i < ra->num_formats; i++) { + const struct vk_format *fmt = ra->formats[i]->priv; + if (fmt->iformat == vk->surf_format.format) { + format = ra->formats[i]; + break; + } + } + + if (!format) { + MP_ERR(ra, "Could not find ra_format suitable for wrapped swchain image " + "with surface format %d\n", vk->surf_format.format); + goto error; + } + + tex = talloc_zero(NULL, struct ra_tex); + tex->params = (struct ra_tex_params) { + .format = format, + .dimensions = 2, + .w = info.imageExtent.width, + .h = info.imageExtent.height, + .d = 1, + .blit_src = !!(info.imageUsage & 
VK_IMAGE_USAGE_TRANSFER_SRC_BIT), + .blit_dst = !!(info.imageUsage & VK_IMAGE_USAGE_TRANSFER_DST_BIT), + .render_src = !!(info.imageUsage & VK_IMAGE_USAGE_SAMPLED_BIT), + .render_dst = !!(info.imageUsage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT), + }; + + struct ra_tex_vk *tex_vk = tex->priv = talloc_zero(tex, struct ra_tex_vk); + tex_vk->type = VK_IMAGE_TYPE_2D; + tex_vk->external_img = true; + tex_vk->img = vkimg; + + if (!vk_init_image(ra, tex)) + goto error; + + return tex; + +error: + vk_tex_destroy(ra, tex); + return NULL; +} + +// For ra_buf.priv +struct ra_buf_vk { + struct vk_bufslice slice; + bool inuse; + bool needsflush; + // "current" metadata, can change during course of execution + VkPipelineStageFlagBits current_stage; + VkAccessFlagBits current_access; +}; + +static void buf_free_to_use(void *priv, struct ra_buf_vk *buf_vk) +{ + buf_vk->inuse = false; +} + +static void buf_barrier(struct vk_cmd *cmd, struct ra_buf *buf, + VkPipelineStageFlagBits newStage, + VkAccessFlagBits newAccess, int offset, size_t size) +{ + struct ra_buf_vk *buf_vk = buf->priv; + + VkBufferMemoryBarrier buffBarrier = { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, + .srcAccessMask = buf_vk->current_access, + .dstAccessMask = newAccess, + .buffer = buf_vk->slice.buf, + .offset = offset, + .size = size, + }; + + if (buf_vk->needsflush || buf->params.host_mapped) { + buffBarrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT; + buf_vk->current_stage = VK_PIPELINE_STAGE_HOST_BIT; + buf_vk->needsflush = false; + } + + vkCmdPipelineBarrier(cmd->buf, buf_vk->current_stage, newStage, 0, + 0, NULL, 1, &buffBarrier, 0, NULL); + + buf_vk->current_stage = newStage; + buf_vk->current_access = newAccess; + buf_vk->inuse = true; + + vk_cmd_callback(cmd, (vk_cb) buf_free_to_use, NULL, buf_vk); +} + +static void vk_buf_destroy(struct ra *ra, struct ra_buf *buf) +{ + if (!buf) + return; + + struct mpvk_ctx *vk = vk_get(ra); + struct ra_buf_vk *buf_vk = buf->priv; + + if (buf_vk->slice.buf) + 
        vk_free_memslice(vk, buf_vk->slice.mem);

    talloc_free(buf);
}

MAKE_LAZY_DESTRUCTOR(vk_buf_destroy, struct ra_buf);

// Update a buffer's contents at the given byte offset. Host-visible buffers
// are written directly via memcpy; otherwise the update is recorded into the
// current command buffer (which requires 4-byte offset alignment, see ra.h).
static void vk_buf_update(struct ra *ra, struct ra_buf *buf, ptrdiff_t offset,
                          const void *data, size_t size)
{
    assert(buf->params.host_mutable || buf->params.initial_data);
    struct ra_buf_vk *buf_vk = buf->priv;

    // For host-mapped buffers, we can just directly memcpy the buffer contents.
    // Otherwise, we can update the buffer from the GPU using a command buffer.
    if (buf_vk->slice.data) {
        assert(offset + size <= buf->params.size);
        uintptr_t addr = (uintptr_t)buf_vk->slice.data + offset;
        memcpy((void *)addr, data, size);
        // The write may need flushing before GPU use (see buf_barrier)
        buf_vk->needsflush = true;
    } else {
        struct vk_cmd *cmd = vk_require_cmd(ra);
        if (!cmd) {
            MP_ERR(ra, "Failed updating buffer!\n");
            return;
        }

        // vkCmdUpdateBuffer requires dstOffset to be a multiple of 4
        VkDeviceSize bufOffset = buf_vk->slice.mem.offset + offset;
        assert(bufOffset == MP_ALIGN_UP(bufOffset, 4));
        vkCmdUpdateBuffer(cmd->buf, buf_vk->slice.buf, bufOffset, size, data);
    }
}

// Create a ra_buf of the given type/size. The slice alignment is chosen to
// satisfy both buf_update (4) and the device's per-type offset alignment.
static struct ra_buf *vk_buf_create(struct ra *ra,
                                    const struct ra_buf_params *params)
{
    struct mpvk_ctx *vk = vk_get(ra);

    struct ra_buf *buf = talloc_zero(NULL, struct ra_buf);
    buf->params = *params;

    struct ra_buf_vk *buf_vk = buf->priv = talloc_zero(buf, struct ra_buf_vk);
    buf_vk->current_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
    buf_vk->current_access = 0;

    VkBufferUsageFlagBits bufFlags = 0;
    VkMemoryPropertyFlagBits memFlags = 0;
    VkDeviceSize align = 4; // alignment 4 is needed for buf_update

    switch (params->type) {
    case RA_BUF_TYPE_TEX_UPLOAD:
        bufFlags |= VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
        memFlags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
        break;
    case RA_BUF_TYPE_UNIFORM:
        bufFlags |= VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
        memFlags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
        align = MP_ALIGN_UP(align, vk->limits.minUniformBufferOffsetAlignment);
        break;
    case RA_BUF_TYPE_SHADER_STORAGE:
        bufFlags |= VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
        memFlags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
        align = MP_ALIGN_UP(align, vk->limits.minStorageBufferOffsetAlignment);
        break;
    case RA_BUF_TYPE_VERTEX:
        bufFlags |= VK_BUFFER_USAGE_VERTEX_BUFFER_BIT;
        memFlags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
        break;
    default: abort();
    }

    if (params->host_mutable || params->initial_data) {
        // The buffer may be written via transfer commands (vkCmdUpdateBuffer)
        bufFlags |= VK_BUFFER_USAGE_TRANSFER_DST_BIT;
        align = MP_ALIGN_UP(align, vk->limits.optimalBufferCopyOffsetAlignment);
    }

    if (params->host_mapped) {
        memFlags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
                    VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
                    VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
    }

    if (!vk_malloc_buffer(vk, bufFlags, memFlags, params->size, align,
                          &buf_vk->slice))
    {
        goto error;
    }

    if (params->host_mapped)
        buf->data = buf_vk->slice.data;

    if (params->initial_data)
        vk_buf_update(ra, buf, 0, params->initial_data, params->size);

    buf->params.initial_data = NULL; // do this after vk_buf_update
    return buf;

error:
    vk_buf_destroy(ra, buf);
    return NULL;
}

// A buffer is free for reuse iff no submitted command buffer references it
static bool vk_buf_poll(struct ra *ra, struct ra_buf *buf)
{
    struct ra_buf_vk *buf_vk = buf->priv;
    return !buf_vk->inuse;
}

// Upload pixel data to a texture, either from a caller-provided ra_buf or
// (when params->buf is NULL) via an internal PBO round-trip.
static bool vk_tex_upload(struct ra *ra,
                          const struct ra_tex_upload_params *params)
{

    struct ra_tex *tex = params->tex;
    struct ra_tex_vk *tex_vk = tex->priv;

    if (!params->buf)
        return ra_tex_upload_pbo(ra, &tex_vk->pbo, params);

    assert(!params->src);
    assert(params->buf);
    struct ra_buf *buf = params->buf;
    struct ra_buf_vk *buf_vk = buf->priv;

    VkBufferImageCopy region = {
        .bufferOffset = buf_vk->slice.mem.offset + params->buf_offset,
        .bufferRowLength = tex->params.w,
        .bufferImageHeight = tex->params.h,
        .imageSubresource = vk_layers,
        .imageExtent = (VkExtent3D){tex->params.w, tex->params.h, tex->params.d},
    };

    if (tex->params.dimensions == 2) {
        // The caller's stride is in bytes, but vulkan wants the row length
        // measured in texels
        int pix_size = tex->params.format->pixel_size;
        region.bufferRowLength =
params->stride / pix_size; + if (region.bufferRowLength * pix_size != params->stride) { + MP_ERR(ra, "Texture upload strides must be a multiple of the texel " + "size!\n"); + goto error; + } + + if (params->rc) { + struct mp_rect *rc = params->rc; + region.imageOffset = (VkOffset3D){rc->x0, rc->y0, 0}; + region.imageExtent = (VkExtent3D){mp_rect_w(*rc), mp_rect_h(*rc), 1}; + } + } + + uint64_t size = region.bufferRowLength * region.bufferImageHeight * + region.imageExtent.depth; + + struct vk_cmd *cmd = vk_require_cmd(ra); + if (!cmd) + goto error; + + buf_barrier(cmd, buf, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_READ_BIT, region.bufferOffset, size); + + tex_barrier(cmd, tex_vk, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + params->invalidate); + + vkCmdCopyBufferToImage(cmd->buf, buf_vk->slice.buf, tex_vk->img, + tex_vk->current_layout, 1, ®ion); + + return true; + +error: + return false; +} + +#define MPVK_NUM_DS MPVK_MAX_STREAMING_DEPTH + +// For ra_renderpass.priv +struct ra_renderpass_vk { + // Compiled shaders + VkShaderModule vert; + VkShaderModule frag; + VkShaderModule comp; + // Pipeline / render pass + VkPipeline pipe; + VkPipelineLayout pipeLayout; + VkPipelineCache pipeCache; + VkRenderPass renderPass; + // Descriptor set (bindings) + VkDescriptorSetLayout dsLayout; + VkDescriptorPool dsPool; + VkDescriptorSet dss[MPVK_NUM_DS]; + int dindex; + // Vertex buffers (vertices) + struct ra_buf_pool vbo; + + // For updating + VkWriteDescriptorSet *dswrite; + VkDescriptorImageInfo *dsiinfo; + VkDescriptorBufferInfo *dsbinfo; +}; + +static void vk_renderpass_destroy(struct ra *ra, struct ra_renderpass *pass) +{ + if (!pass) + return; + + struct mpvk_ctx *vk = vk_get(ra); + struct ra_renderpass_vk *pass_vk = pass->priv; + + ra_buf_pool_uninit(ra, &pass_vk->vbo); + vkDestroyPipeline(vk->dev, pass_vk->pipe, MPVK_ALLOCATOR); + vkDestroyPipelineCache(vk->dev, pass_vk->pipeCache, 
MPVK_ALLOCATOR); + vkDestroyRenderPass(vk->dev, pass_vk->renderPass, MPVK_ALLOCATOR); + vkDestroyPipelineLayout(vk->dev, pass_vk->pipeLayout, MPVK_ALLOCATOR); + vkDestroyDescriptorPool(vk->dev, pass_vk->dsPool, MPVK_ALLOCATOR); + vkDestroyDescriptorSetLayout(vk->dev, pass_vk->dsLayout, MPVK_ALLOCATOR); + vkDestroyShaderModule(vk->dev, pass_vk->vert, MPVK_ALLOCATOR); + vkDestroyShaderModule(vk->dev, pass_vk->frag, MPVK_ALLOCATOR); + vkDestroyShaderModule(vk->dev, pass_vk->comp, MPVK_ALLOCATOR); + + talloc_free(pass); +} + +MAKE_LAZY_DESTRUCTOR(vk_renderpass_destroy, struct ra_renderpass); + +static const VkDescriptorType dsType[] = { + [RA_VARTYPE_TEX] = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + [RA_VARTYPE_IMG_W] = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + [RA_VARTYPE_BUF_RO] = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + [RA_VARTYPE_BUF_RW] = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, +}; + +static bool vk_get_input_format(struct ra *ra, struct ra_renderpass_input *inp, + VkFormat *out_fmt) +{ + struct mpvk_ctx *vk = vk_get(ra); + + enum ra_ctype ctype; + switch (inp->type) { + case RA_VARTYPE_FLOAT: ctype = RA_CTYPE_FLOAT; break; + case RA_VARTYPE_BYTE_UNORM: ctype = RA_CTYPE_UNORM; break; + default: abort(); + } + + assert(inp->dim_m == 1); + for (const struct vk_format *fmt = vk_formats; fmt->name; fmt++) { + if (fmt->ctype != ctype) + continue; + if (fmt->components != inp->dim_v) + continue; + if (fmt->bytes != ra_renderpass_input_layout(inp).size) + continue; + + // Ensure this format is valid for vertex attributes + VkFormatProperties prop; + vkGetPhysicalDeviceFormatProperties(vk->physd, fmt->iformat, &prop); + if (!(prop.bufferFeatures & VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT)) + continue; + + *out_fmt = fmt->iformat; + return true; + } + + return false; +} + +static const VkPipelineStageFlagBits stageFlags[] = { + [RA_RENDERPASS_TYPE_RASTER] = VK_SHADER_STAGE_FRAGMENT_BIT, + [RA_RENDERPASS_TYPE_COMPUTE] = VK_SHADER_STAGE_COMPUTE_BIT, +}; + +static struct ra_renderpass 
*vk_renderpass_create(struct ra *ra, + const struct ra_renderpass_params *params) +{ + struct mpvk_ctx *vk = vk_get(ra); + + struct ra_renderpass *pass = talloc_zero(NULL, struct ra_renderpass); + pass->params = *ra_renderpass_params_copy(pass, params); + pass->params.cached_program = (bstr){0}; + struct ra_renderpass_vk *pass_vk = pass->priv = + talloc_zero(pass, struct ra_renderpass_vk); + + static int dsCount[RA_VARTYPE_COUNT] = {0}; + VkDescriptorSetLayoutBinding *bindings = NULL; + int num_bindings = 0; + + for (int i = 0; i < params->num_inputs; i++) { + struct ra_renderpass_input *inp = ¶ms->inputs[i]; + switch (inp->type) { + case RA_VARTYPE_TEX: + case RA_VARTYPE_IMG_W: + case RA_VARTYPE_BUF_RO: + case RA_VARTYPE_BUF_RW: { + VkDescriptorSetLayoutBinding desc = { + .binding = inp->binding, + .descriptorType = dsType[inp->type], + .descriptorCount = 1, + .stageFlags = stageFlags[params->type], + }; + + MP_TARRAY_APPEND(pass, bindings, num_bindings, desc); + dsCount[inp->type]++; + break; + } + default: abort(); + } + } + + VkDescriptorPoolSize *dsPoolSizes = NULL; + int poolSizeCount = 0; + for (enum ra_vartype t = 0; t < RA_VARTYPE_COUNT; t++) { + if (dsCount[t] > 0) { + VkDescriptorPoolSize dssize = { + .type = dsType[t], + .descriptorCount = dsCount[t] * MPVK_NUM_DS, + }; + + MP_TARRAY_APPEND(pass, dsPoolSizes, poolSizeCount, dssize); + } + } + + VkDescriptorPoolCreateInfo pinfo = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, + .maxSets = MPVK_NUM_DS, + .pPoolSizes = dsPoolSizes, + .poolSizeCount = poolSizeCount, + }; + + VK(vkCreateDescriptorPool(vk->dev, &pinfo, MPVK_ALLOCATOR, &pass_vk->dsPool)); + talloc_free(dsPoolSizes); + + pass_vk->dswrite = talloc_array(pass, VkWriteDescriptorSet, num_bindings); + pass_vk->dsiinfo = talloc_array(pass, VkDescriptorImageInfo, num_bindings); + pass_vk->dsbinfo = talloc_array(pass, VkDescriptorBufferInfo, num_bindings); + + VkDescriptorSetLayoutCreateInfo dinfo = { + .sType = 
VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .pBindings = bindings, + .bindingCount = num_bindings, + }; + + VK(vkCreateDescriptorSetLayout(vk->dev, &dinfo, MPVK_ALLOCATOR, + &pass_vk->dsLayout)); + + VkDescriptorSetAllocateInfo ainfo = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, + .descriptorPool = pass_vk->dsPool, + .descriptorSetCount = 1, + .pSetLayouts = &pass_vk->dsLayout, + }; + + for (int i = 0; i < MPVK_NUM_DS; i++) + VK(vkAllocateDescriptorSets(vk->dev, &ainfo, &pass_vk->dss[i])); + + VkPipelineLayoutCreateInfo linfo = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .setLayoutCount = 1, + .pSetLayouts = &pass_vk->dsLayout, + }; + + VK(vkCreatePipelineLayout(vk->dev, &linfo, MPVK_ALLOCATOR, + &pass_vk->pipeLayout)); + + VkPipelineCacheCreateInfo pcinfo = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO, + .pInitialData = params->cached_program.start, + .initialDataSize = params->cached_program.len, + }; + + VK(vkCreatePipelineCache(vk->dev, &pcinfo, MPVK_ALLOCATOR, &pass_vk->pipeCache)); + + VkShaderModuleCreateInfo sinfo = { + .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, + }; + + switch (params->type) { + case RA_RENDERPASS_TYPE_RASTER: { + sinfo.pCode = (uint32_t *)params->vertex_shader; + sinfo.codeSize = strlen(params->vertex_shader); + VK(vkCreateShaderModule(vk->dev, &sinfo, MPVK_ALLOCATOR, &pass_vk->vert)); + + sinfo.pCode = (uint32_t *)params->frag_shader; + sinfo.codeSize = strlen(params->frag_shader); + VK(vkCreateShaderModule(vk->dev, &sinfo, MPVK_ALLOCATOR, &pass_vk->frag)); + + VK(vk_create_render_pass(vk->dev, params->target_format, + params->enable_blend, &pass_vk->renderPass)); + + VkPipelineShaderStageCreateInfo stages[] = { + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_VERTEX_BIT, + .module = pass_vk->vert, + .pName = "main", + }, + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = 
VK_SHADER_STAGE_FRAGMENT_BIT, + .module = pass_vk->frag, + .pName = "main", + } + }; + + VkVertexInputAttributeDescription *attrs = talloc_array(pass, + VkVertexInputAttributeDescription, params->num_vertex_attribs); + + for (int i = 0; i < params->num_vertex_attribs; i++) { + struct ra_renderpass_input *inp = ¶ms->vertex_attribs[i]; + attrs[i] = (VkVertexInputAttributeDescription) { + .location = i, + .binding = 0, + .offset = inp->offset, + }; + + if (!vk_get_input_format(ra, inp, &attrs[i].format)) { + MP_ERR(ra, "No suitable VkFormat for vertex attrib '%s'!\n", + inp->name); + goto error; + } + } + + static const VkBlendFactor blendFactors[] = { + [RA_BLEND_ZERO] = VK_BLEND_FACTOR_ZERO, + [RA_BLEND_ONE] = VK_BLEND_FACTOR_ONE, + [RA_BLEND_SRC_ALPHA] = VK_BLEND_FACTOR_SRC_ALPHA, + [RA_BLEND_ONE_MINUS_SRC_ALPHA] = VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA, + }; + + VkPipelineColorBlendAttachmentState binfo = { + .blendEnable = params->enable_blend, + .colorBlendOp = VK_BLEND_OP_ADD, + .srcColorBlendFactor = blendFactors[params->blend_src_rgb], + .dstColorBlendFactor = blendFactors[params->blend_dst_rgb], + .alphaBlendOp = VK_BLEND_OP_ADD, + .srcAlphaBlendFactor = blendFactors[params->blend_src_alpha], + .dstAlphaBlendFactor = blendFactors[params->blend_dst_alpha], + .colorWriteMask = VK_COLOR_COMPONENT_R_BIT | + VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT | + VK_COLOR_COMPONENT_A_BIT, + }; + + VkGraphicsPipelineCreateInfo cinfo = { + .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, + .stageCount = MP_ARRAY_SIZE(stages), + .pStages = &stages[0], + .pVertexInputState = &(VkPipelineVertexInputStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, + .vertexBindingDescriptionCount = 1, + .pVertexBindingDescriptions = &(VkVertexInputBindingDescription) { + .binding = 0, + .stride = params->vertex_stride, + .inputRate = VK_VERTEX_INPUT_RATE_VERTEX, + }, + .vertexAttributeDescriptionCount = params->num_vertex_attribs, + 
.pVertexAttributeDescriptions = attrs, + }, + .pInputAssemblyState = &(VkPipelineInputAssemblyStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, + .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST, + }, + .pViewportState = &(VkPipelineViewportStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO, + .viewportCount = 1, + .scissorCount = 1, + }, + .pRasterizationState = &(VkPipelineRasterizationStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO, + .polygonMode = VK_POLYGON_MODE_FILL, + .cullMode = VK_CULL_MODE_NONE, + .lineWidth = 1.0f, + }, + .pMultisampleState = &(VkPipelineMultisampleStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, + .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT, + }, + .pColorBlendState = &(VkPipelineColorBlendStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = &binfo, + }, + .pDynamicState = &(VkPipelineDynamicStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO, + .dynamicStateCount = 2, + .pDynamicStates = (VkDynamicState[]){ + VK_DYNAMIC_STATE_VIEWPORT, + VK_DYNAMIC_STATE_SCISSOR, + }, + }, + .layout = pass_vk->pipeLayout, + .renderPass = pass_vk->renderPass, + }; + + VK(vkCreateGraphicsPipelines(vk->dev, pass_vk->pipeCache, 1, &cinfo, + MPVK_ALLOCATOR, &pass_vk->pipe)); + break; + } + case RA_RENDERPASS_TYPE_COMPUTE: { + sinfo.pCode = (uint32_t *)params->compute_shader; + sinfo.codeSize = strlen(params->compute_shader); + VK(vkCreateShaderModule(vk->dev, &sinfo, MPVK_ALLOCATOR, &pass_vk->comp)); + + VkComputePipelineCreateInfo cinfo = { + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .stage = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .module = pass_vk->comp, + .pName = "main", + }, + .layout = pass_vk->pipeLayout, + 
}; + + VK(vkCreateComputePipelines(vk->dev, pass_vk->pipeCache, 1, &cinfo, + MPVK_ALLOCATOR, &pass_vk->pipe)); + break; + } + } + + // Update cached program + bstr *prog = &pass->params.cached_program; + VK(vkGetPipelineCacheData(vk->dev, pass_vk->pipeCache, &prog->len, NULL)); + prog->start = talloc_size(pass, prog->len); + VK(vkGetPipelineCacheData(vk->dev, pass_vk->pipeCache, &prog->len, prog->start)); + + return pass; + +error: + vk_renderpass_destroy(ra, pass); + return NULL; +} + +static void vk_update_descriptor(struct vk_cmd *cmd, + struct ra_renderpass *pass, + struct ra_renderpass_input_val val, + VkDescriptorSet ds, int idx) +{ + struct ra_renderpass_vk *pass_vk = pass->priv; + struct ra_renderpass_input *inp = &pass->params.inputs[val.index]; + + VkWriteDescriptorSet *wds = &pass_vk->dswrite[idx]; + *wds = (VkWriteDescriptorSet) { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = ds, + .dstBinding = inp->binding, + .descriptorCount = 1, + .descriptorType = dsType[inp->type], + }; + + switch (inp->type) { + case RA_VARTYPE_TEX: { + struct ra_tex *tex = *(struct ra_tex **)val.data; + struct ra_tex_vk *tex_vk = tex->priv; + + assert(tex->params.render_src); + tex_barrier(cmd, tex_vk, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, + VK_ACCESS_SHADER_READ_BIT, + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, false); + + VkDescriptorImageInfo *iinfo = &pass_vk->dsiinfo[idx]; + *iinfo = (VkDescriptorImageInfo) { + .sampler = tex_vk->sampler, + .imageView = tex_vk->view, + .imageLayout = tex_vk->current_layout, + }; + + wds->pImageInfo = iinfo; + break; + } + case RA_VARTYPE_IMG_W: { + struct ra_tex *tex = *(struct ra_tex **)val.data; + struct ra_tex_vk *tex_vk = tex->priv; + + assert(tex->params.storage_dst); + tex_barrier(cmd, tex_vk, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, + VK_ACCESS_SHADER_WRITE_BIT, + VK_IMAGE_LAYOUT_GENERAL, false); + + VkDescriptorImageInfo *iinfo = &pass_vk->dsiinfo[idx]; + *iinfo = (VkDescriptorImageInfo) { + .imageView = tex_vk->view, 
+ .imageLayout = tex_vk->current_layout, + }; + + wds->pImageInfo = iinfo; + break; + } + case RA_VARTYPE_BUF_RO: + case RA_VARTYPE_BUF_RW: { + struct ra_buf *buf = *(struct ra_buf **)val.data; + struct ra_buf_vk *buf_vk = buf->priv; + + VkBufferUsageFlags access = VK_ACCESS_SHADER_READ_BIT; + if (inp->type == RA_VARTYPE_BUF_RW) + access |= VK_ACCESS_SHADER_WRITE_BIT; + + buf_barrier(cmd, buf, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, + access, buf_vk->slice.mem.offset, buf->params.size); + + VkDescriptorBufferInfo *binfo = &pass_vk->dsbinfo[idx]; + *binfo = (VkDescriptorBufferInfo) { + .buffer = buf_vk->slice.buf, + .offset = buf_vk->slice.mem.offset, + .range = buf->params.size, + }; + + wds->pBufferInfo = binfo; + break; + } + } +} + +static void vk_renderpass_run(struct ra *ra, + const struct ra_renderpass_run_params *params) +{ + struct mpvk_ctx *vk = vk_get(ra); + struct ra_renderpass *pass = params->pass; + struct ra_renderpass_vk *pass_vk = pass->priv; + + struct vk_cmd *cmd = vk_require_cmd(ra); + if (!cmd) + goto error; + + static const VkPipelineBindPoint bindPoint[] = { + [RA_RENDERPASS_TYPE_RASTER] = VK_PIPELINE_BIND_POINT_GRAPHICS, + [RA_RENDERPASS_TYPE_COMPUTE] = VK_PIPELINE_BIND_POINT_COMPUTE, + }; + + vkCmdBindPipeline(cmd->buf, bindPoint[pass->params.type], pass_vk->pipe); + + VkDescriptorSet ds = pass_vk->dss[pass_vk->dindex++]; + pass_vk->dindex %= MPVK_NUM_DS; + + for (int i = 0; i < params->num_values; i++) + vk_update_descriptor(cmd, pass, params->values[i], ds, i); + + if (params->num_values > 0) { + vkUpdateDescriptorSets(vk->dev, params->num_values, pass_vk->dswrite, + 0, NULL); + } + + vkCmdBindDescriptorSets(cmd->buf, bindPoint[pass->params.type], + pass_vk->pipeLayout, 0, 1, &ds, 0, NULL); + + switch (pass->params.type) { + case RA_RENDERPASS_TYPE_COMPUTE: + vkCmdDispatch(cmd->buf, params->compute_groups[0], + params->compute_groups[1], + params->compute_groups[2]); + break; + case RA_RENDERPASS_TYPE_RASTER: { + struct ra_tex *tex = 
params->target; + struct ra_tex_vk *tex_vk = tex->priv; + assert(tex->params.render_dst); + + struct ra_buf_params buf_params = { + .type = RA_BUF_TYPE_VERTEX, + .size = params->vertex_count * pass->params.vertex_stride, + .host_mutable = true, + }; + + struct ra_buf *buf = ra_buf_pool_get(ra, &pass_vk->vbo, &buf_params); + if (!buf) { + MP_ERR(ra, "Failed allocating vertex buffer!\n"); + goto error; + } + struct ra_buf_vk *buf_vk = buf->priv; + + vk_buf_update(ra, buf, 0, params->vertex_data, buf_params.size); + + buf_barrier(cmd, buf, VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, + VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT, + buf_vk->slice.mem.offset, buf_params.size); + + vkCmdBindVertexBuffers(cmd->buf, 0, 1, &buf_vk->slice.buf, + &buf_vk->slice.mem.offset); + + tex_barrier(cmd, tex_vk, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, + VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, false); + + VkViewport viewport = { + .x = params->viewport.x0, + .y = params->viewport.y0, + .width = mp_rect_w(params->viewport), + .height = mp_rect_h(params->viewport), + }; + + VkRect2D scissor = { + .offset = {params->scissors.x0, params->scissors.y0}, + .extent = {mp_rect_w(params->scissors), mp_rect_h(params->scissors)}, + }; + + vkCmdSetViewport(cmd->buf, 0, 1, &viewport); + vkCmdSetScissor(cmd->buf, 0, 1, &scissor); + + VkRenderPassBeginInfo binfo = { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, + .renderPass = pass_vk->renderPass, + .framebuffer = tex_vk->framebuffer, + .renderArea = (VkRect2D){{0, 0}, {tex->params.w, tex->params.h}}, + }; + + vkCmdBeginRenderPass(cmd->buf, &binfo, VK_SUBPASS_CONTENTS_INLINE); + vkCmdDraw(cmd->buf, params->vertex_count, 1, 0, 0); + vkCmdEndRenderPass(cmd->buf); + break; + } + default: abort(); + }; + +error: + return; +} + +static void vk_blit(struct ra *ra, struct ra_tex *dst, struct ra_tex *src, + struct mp_rect *dst_rc, struct mp_rect *src_rc) +{ + assert(src->params.blit_src); + assert(dst->params.blit_dst); + 
+ struct ra_tex_vk *src_vk = src->priv; + struct ra_tex_vk *dst_vk = dst->priv; + + struct vk_cmd *cmd = vk_require_cmd(ra); + if (!cmd) + return; + + tex_barrier(cmd, src_vk, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_READ_BIT, + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + false); + + bool discard = dst_rc->x0 == 0 && + dst_rc->y0 == 0 && + dst_rc->x1 == dst->params.w && + dst_rc->y1 == dst->params.h; + + tex_barrier(cmd, dst_vk, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + discard); + + VkImageBlit region = { + .srcSubresource = vk_layers, + .srcOffsets = {{src_rc->x0, src_rc->y0, 0}, {src_rc->x1, src_rc->y1, 1}}, + .dstSubresource = vk_layers, + .dstOffsets = {{dst_rc->x0, dst_rc->y0, 0}, {dst_rc->x1, dst_rc->y1, 1}}, + }; + + vkCmdBlitImage(cmd->buf, src_vk->img, src_vk->current_layout, dst_vk->img, + dst_vk->current_layout, 1, &region, VK_FILTER_NEAREST); +} + +static void vk_clear(struct ra *ra, struct ra_tex *tex, float color[4], + struct mp_rect *rc) +{ + struct ra_vk *p = ra->priv; + struct ra_tex_vk *tex_vk = tex->priv; + assert(tex->params.blit_dst); + + struct vk_cmd *cmd = vk_require_cmd(ra); + if (!cmd) + return; + + struct mp_rect full = {0, 0, tex->params.w, tex->params.h}; + if (!rc || mp_rect_equals(rc, &full)) { + // To clear the entire image, we can use the efficient clear command + tex_barrier(cmd, tex_vk, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, true); + + VkClearColorValue clearColor = {0}; + for (int c = 0; c < 4; c++) + clearColor.float32[c] = color[c]; + + vkCmdClearColorImage(cmd->buf, tex_vk->img, tex_vk->current_layout, + &clearColor, 1, &vk_range); + } else { + // To simulate per-region clearing, we blit from a 1x1 texture instead + struct ra_tex_upload_params ul_params = { + .tex = p->clear_tex, + .invalidate = true, + .src = &color[0], + }; + vk_tex_upload(ra, &ul_params); + vk_blit(ra, tex, p->clear_tex, rc, 
&(struct mp_rect){0, 0, 1, 1}); + } +} + +#define VK_QUERY_POOL_SIZE (MPVK_MAX_STREAMING_DEPTH * 4) + +struct vk_timer { + VkQueryPool pool; + int index; + uint64_t result; +}; + +static void vk_timer_destroy(struct ra *ra, ra_timer *ratimer) +{ + if (!ratimer) + return; + + struct mpvk_ctx *vk = vk_get(ra); + struct vk_timer *timer = ratimer; + + vkDestroyQueryPool(vk->dev, timer->pool, MPVK_ALLOCATOR); + + talloc_free(timer); +} + +MAKE_LAZY_DESTRUCTOR(vk_timer_destroy, ra_timer); + +static ra_timer *vk_timer_create(struct ra *ra) +{ + struct mpvk_ctx *vk = vk_get(ra); + + struct vk_timer *timer = talloc_zero(NULL, struct vk_timer); + + struct VkQueryPoolCreateInfo qinfo = { + .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, + .queryType = VK_QUERY_TYPE_TIMESTAMP, + .queryCount = VK_QUERY_POOL_SIZE, + }; + + VK(vkCreateQueryPool(vk->dev, &qinfo, MPVK_ALLOCATOR, &timer->pool)); + + return (ra_timer *)timer; + +error: + vk_timer_destroy(ra, timer); + return NULL; +} + +static void vk_timer_start(struct ra *ra, ra_timer *ratimer) +{ + struct mpvk_ctx *vk = vk_get(ra); + struct vk_timer *timer = ratimer; + + struct vk_cmd *cmd = vk_require_cmd(ra); + if (!cmd) + return; + + timer->index = (timer->index + 2) % VK_QUERY_POOL_SIZE; + + uint64_t out[2]; + VkResult res = vkGetQueryPoolResults(vk->dev, timer->pool, timer->index, 2, + sizeof(out), &out[0], sizeof(uint64_t), + VK_QUERY_RESULT_64_BIT); + switch (res) { + case VK_SUCCESS: + timer->result = out[1] - out[0]; + break; + case VK_NOT_READY: + timer->result = 0; + break; + default: + MP_WARN(vk, "Failed reading timer query result: %s\n", vk_err(res)); + return; + }; + + vkCmdWriteTimestamp(cmd->buf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + timer->pool, timer->index); +} + +static uint64_t vk_timer_stop(struct ra *ra, ra_timer *ratimer) +{ + struct vk_timer *timer = ratimer; + struct vk_cmd *cmd = vk_require_cmd(ra); + + if (cmd) { + vkCmdWriteTimestamp(cmd->buf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + timer->pool, 
timer->index + 1); + } + + return timer->result; +} + +static struct ra_fns ra_fns_vk = { + .destroy = vk_destroy_ra, + .tex_create = vk_tex_create, + .tex_destroy = vk_tex_destroy_lazy, + .tex_upload = vk_tex_upload, + .buf_create = vk_buf_create, + .buf_destroy = vk_buf_destroy_lazy, + .buf_update = vk_buf_update, + .buf_poll = vk_buf_poll, + .clear = vk_clear, + .blit = vk_blit, + .renderpass_create = vk_renderpass_create, + .renderpass_destroy = vk_renderpass_destroy_lazy, + .renderpass_run = vk_renderpass_run, + .timer_create = vk_timer_create, + .timer_destroy = vk_timer_destroy_lazy, + .timer_start = vk_timer_start, + .timer_stop = vk_timer_stop, +}; + +static void present_cb(struct ra *ra, int *inflight) +{ + *inflight -= 1; +} + +bool ra_vk_present_frame(struct ra *ra, struct vk_swimg *swimg, int *inflight) +{ + struct ra_vk *p = ra->priv; + struct mpvk_ctx *vk = vk_get(ra); + assert(p->active_cmd); + + if (inflight) { + *inflight += 1; + vk_callback(ra, (vk_cb)present_cb, inflight); + } + + struct ra_tex *img = swimg->image; + + tex_barrier(p->active_cmd, img->priv, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + 0, VK_IMAGE_LAYOUT_PRESENT_SRC_KHR, false); + + // These are the only two stages that we use/support for actually + // outputting to swapchain imagechain images, so just add a dependency + // on both of them. In theory, we could maybe come up with some more + // advanced mechanism of tracking dynamic dependencies, but that seems + // like overkill. 
+ vk_cmd_dep(p->active_cmd, swimg->acquired, + VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT | + VK_PIPELINE_STAGE_TRANSFER_BIT); + + VkSemaphore done; + if (!vk_cmd_submit(vk, p->active_cmd, &done)) + goto error; + p->active_cmd = NULL; + + struct vk_cmdpool *pool = vk->pool; + VkQueue queue = pool->queues[pool->qindex]; + pool->qindex %= pool->qcount; + + VkPresentInfoKHR pinfo = { + .sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR, + .waitSemaphoreCount = 1, + .pWaitSemaphores = &done, + .swapchainCount = 1, + .pSwapchains = &swimg->chain->swchain, + .pImageIndices = &swimg->index, + }; + + VK(vkQueuePresentKHR(queue, &pinfo)); + + return true; + +error: + return false; +} diff --git a/video/out/vulkan/ra_vk.h b/video/out/vulkan/ra_vk.h new file mode 100644 index 0000000000000..214a9af6f3552 --- /dev/null +++ b/video/out/vulkan/ra_vk.h @@ -0,0 +1,25 @@ +#pragma once + +#include "common.h" +#include "utils.h" +#include "video/out/opengl/ra.h" + +struct ra *ra_create_vk(struct mpvk_ctx *vk, struct mp_log *log); + +// Access to the VkDevice is needed for swapchain creation +VkDevice ra_vk_get_dev(struct ra *ra); + +// Allocates a ra_tex that wraps a swapchain image. The contents of the image +// will be invalidated, and access to it will only be internally synchronized. +// So the calling could should not do anything else with the VkImage. +struct ra_tex *ra_vk_wrap_swchain_img(struct ra *ra, VkImage vkimg, + VkSwapchainCreateInfoKHR info); + +// This function flushes the command buffers, and enqueues the image for +// presentation. This command must only be used after drawing to the vk_swchain, +// but before the command buffers are flushed for other reasons (for +// synchronization). The frames_in_flight pointer will be used to track how +// many frames are currently in flight. 
(That is, it will be incremented when +// this function is called, and decremented when the command completes) +bool ra_vk_present_frame(struct ra *ra, struct vk_swimg *swimg, + int *frames_in_flight); diff --git a/video/out/vulkan/utils.c b/video/out/vulkan/utils.c new file mode 100644 index 0000000000000..6c14bce2455d4 --- /dev/null +++ b/video/out/vulkan/utils.c @@ -0,0 +1,936 @@ +#include + +#include "utils.h" +#include "malloc.h" +#include "ra_vk.h" +#include "video/out/x11_common.h" + +const char* vk_err(VkResult res) +{ + switch (res) { + // These are technically success codes, but include them nonetheless + case VK_SUCCESS: return "VK_SUCCESS"; + case VK_NOT_READY: return "VK_NOT_READY"; + case VK_TIMEOUT: return "VK_TIMEOUT"; + case VK_EVENT_SET: return "VK_EVENT_SET"; + case VK_EVENT_RESET: return "VK_EVENT_RESET"; + case VK_INCOMPLETE: return "VK_INCOMPLETE"; + + // Actual error codes + case VK_ERROR_OUT_OF_HOST_MEMORY: return "VK_ERROR_OUT_OF_HOST_MEMORY"; + case VK_ERROR_OUT_OF_DEVICE_MEMORY: return "VK_ERROR_OUT_OF_DEVICE_MEMORY"; + case VK_ERROR_INITIALIZATION_FAILED: return "VK_ERROR_INITIALIZATION_FAILED"; + case VK_ERROR_DEVICE_LOST: return "VK_ERROR_DEVICE_LOST"; + case VK_ERROR_MEMORY_MAP_FAILED: return "VK_ERROR_MEMORY_MAP_FAILED"; + case VK_ERROR_LAYER_NOT_PRESENT: return "VK_ERROR_LAYER_NOT_PRESENT"; + case VK_ERROR_EXTENSION_NOT_PRESENT: return "VK_ERROR_EXTENSION_NOT_PRESENT"; + case VK_ERROR_FEATURE_NOT_PRESENT: return "VK_ERROR_FEATURE_NOT_PRESENT"; + case VK_ERROR_INCOMPATIBLE_DRIVER: return "VK_ERROR_INCOMPATIBLE_DRIVER"; + case VK_ERROR_TOO_MANY_OBJECTS: return "VK_ERROR_TOO_MANY_OBJECTS"; + case VK_ERROR_FORMAT_NOT_SUPPORTED: return "VK_ERROR_FORMAT_NOT_SUPPORTED"; + case VK_ERROR_FRAGMENTED_POOL: return "VK_ERROR_FRAGMENTED_POOL"; + } + + return "Unknown error!"; +} + +static const char* vk_dbg_type(VkDebugReportObjectTypeEXT type) +{ + switch (type) { + case VK_DEBUG_REPORT_OBJECT_TYPE_INSTANCE_EXT: + return "VkInstance"; + case 
VK_DEBUG_REPORT_OBJECT_TYPE_PHYSICAL_DEVICE_EXT: + return "VkPhysicalDevice"; + case VK_DEBUG_REPORT_OBJECT_TYPE_DEVICE_EXT: + return "VkDevice"; + case VK_DEBUG_REPORT_OBJECT_TYPE_QUEUE_EXT: + return "VkQueue"; + case VK_DEBUG_REPORT_OBJECT_TYPE_SEMAPHORE_EXT: + return "VkSemaphore"; + case VK_DEBUG_REPORT_OBJECT_TYPE_COMMAND_BUFFER_EXT: + return "VkCommandBuffer"; + case VK_DEBUG_REPORT_OBJECT_TYPE_FENCE_EXT: + return "VkFence"; + case VK_DEBUG_REPORT_OBJECT_TYPE_DEVICE_MEMORY_EXT: + return "VkDeviceMemory"; + case VK_DEBUG_REPORT_OBJECT_TYPE_BUFFER_EXT: + return "VkBuffer"; + case VK_DEBUG_REPORT_OBJECT_TYPE_IMAGE_EXT: + return "VkImage"; + case VK_DEBUG_REPORT_OBJECT_TYPE_EVENT_EXT: + return "VkEvent"; + case VK_DEBUG_REPORT_OBJECT_TYPE_QUERY_POOL_EXT: + return "VkQueryPool"; + case VK_DEBUG_REPORT_OBJECT_TYPE_BUFFER_VIEW_EXT: + return "VkBufferView"; + case VK_DEBUG_REPORT_OBJECT_TYPE_IMAGE_VIEW_EXT: + return "VkImageView"; + case VK_DEBUG_REPORT_OBJECT_TYPE_SHADER_MODULE_EXT: + return "VkShaderModule"; + case VK_DEBUG_REPORT_OBJECT_TYPE_PIPELINE_CACHE_EXT: + return "VkPipelineCache"; + case VK_DEBUG_REPORT_OBJECT_TYPE_PIPELINE_LAYOUT_EXT: + return "VkPipelineLayout"; + case VK_DEBUG_REPORT_OBJECT_TYPE_RENDER_PASS_EXT: + return "VkRenderPass"; + case VK_DEBUG_REPORT_OBJECT_TYPE_PIPELINE_EXT: + return "VkPipeline"; + case VK_DEBUG_REPORT_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT_EXT: + return "VkDescriptorSetLayout"; + case VK_DEBUG_REPORT_OBJECT_TYPE_SAMPLER_EXT: + return "VkSampler"; + case VK_DEBUG_REPORT_OBJECT_TYPE_DESCRIPTOR_POOL_EXT: + return "VkDescriptorPool"; + case VK_DEBUG_REPORT_OBJECT_TYPE_DESCRIPTOR_SET_EXT: + return "VkDescriptorSet"; + case VK_DEBUG_REPORT_OBJECT_TYPE_FRAMEBUFFER_EXT: + return "VkFramebuffer"; + case VK_DEBUG_REPORT_OBJECT_TYPE_COMMAND_POOL_EXT: + return "VkCommandPool"; + case VK_DEBUG_REPORT_OBJECT_TYPE_SURFACE_KHR_EXT: + return "VkSurfaceKHR"; + case VK_DEBUG_REPORT_OBJECT_TYPE_SWAPCHAIN_KHR_EXT: + return "VkSwapchainKHR"; + case 
VK_DEBUG_REPORT_OBJECT_TYPE_DEBUG_REPORT_EXT: + return "VkDebugReportCallbackEXT"; + case VK_DEBUG_REPORT_OBJECT_TYPE_UNKNOWN_EXT: + default: + return "unknown object"; + } +} + +static VkBool32 vk_dbg_callback(VkDebugReportFlagsEXT flags, + VkDebugReportObjectTypeEXT objType, + uint64_t obj, size_t loc, int32_t msgCode, + const char *layer, const char *msg, void *priv) +{ + struct mpvk_ctx *vk = priv; + int lev = MSGL_V; + + switch (flags) { + case VK_DEBUG_REPORT_ERROR_BIT_EXT: lev = MSGL_ERR; break; + case VK_DEBUG_REPORT_WARNING_BIT_EXT: lev = MSGL_WARN; break; + case VK_DEBUG_REPORT_INFORMATION_BIT_EXT: lev = MSGL_TRACE; break; + case VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT: lev = MSGL_WARN; break; + case VK_DEBUG_REPORT_DEBUG_BIT_EXT: lev = MSGL_DEBUG; break; + }; + + MP_MSG(vk, lev, "vk [%s] %d: %s (obj 0x%lx (%s), loc 0x%lx)\n", + layer, msgCode, msg, obj, vk_dbg_type(objType), loc); + + // The return value of this function determines whether the call will + // be explicitly aborted (to prevent GPU errors) or not. In this case, + // we generally want this to be on for the errors. + return (flags & VK_DEBUG_REPORT_ERROR_BIT_EXT); +} + +void mpvk_uninit(struct mpvk_ctx *vk) +{ + if (!vk->inst) + return; + + if (vk->dev) { + struct vk_cmdpool *pool = vk->pool; + // also frees associated command buffers + vkDestroyCommandPool(vk->dev, pool->pool, MPVK_ALLOCATOR); + for (int n = 0; n < MPVK_MAX_CMDS; n++) { + vkDestroyFence(vk->dev, pool->cmds[n].fence, MPVK_ALLOCATOR); + vkDestroySemaphore(vk->dev, pool->cmds[n].done, MPVK_ALLOCATOR); + talloc_free(pool->cmds[n].callbacks); + } + talloc_free(vk->pool); + vk_malloc_uninit(vk); + vkDestroyDevice(vk->dev, MPVK_ALLOCATOR); + } + + if (vk->dbg) { + // Same deal as creating the debug callback, we need to load this + // first. 
+ VK_LOAD_PFN(vkDestroyDebugReportCallbackEXT) + pfn_vkDestroyDebugReportCallbackEXT(vk->inst, vk->dbg, MPVK_ALLOCATOR); + } + + vkDestroySurfaceKHR(vk->inst, vk->surf, MPVK_ALLOCATOR); + vkDestroyInstance(vk->inst, MPVK_ALLOCATOR); + + *vk = (struct mpvk_ctx){0}; +} + +bool mpvk_instance_init(struct mpvk_ctx *vk, bool debug) +{ + VkInstanceCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, + }; + + if (debug) { + // Enables the LunarG standard validation layer, which + // is a meta-layer that loads lots of other validators + static const char* layers[] = { + "VK_LAYER_LUNARG_standard_validation", + }; + + info.ppEnabledLayerNames = layers; + info.enabledLayerCount = MP_ARRAY_SIZE(layers); + } + + // Enable whatever extensions were compiled in. + static const char *extensions[] = { + VK_KHR_SURFACE_EXTENSION_NAME, +#if HAVE_VULKAN_XLIB + VK_KHR_XLIB_SURFACE_EXTENSION_NAME, +#endif + + // Extra extensions only used for debugging. These are toggled by + // decreasing the enabledExtensionCount, so the number needs to be + // synchronized with the code below. 
+ VK_EXT_DEBUG_REPORT_EXTENSION_NAME, + }; + + const int debugExtensionCount = 1; + + info.ppEnabledExtensionNames = extensions; + info.enabledExtensionCount = MP_ARRAY_SIZE(extensions); + + if (!debug) + info.enabledExtensionCount -= debugExtensionCount; + + VkResult res = vkCreateInstance(&info, MPVK_ALLOCATOR, &vk->inst); + if (res != VK_SUCCESS) { + MP_VERBOSE(vk, "failed creating instance: %s\n", vk_err(res)); + return false; + } + + if (debug) { + // Set up a debug callback to catch validation messages + VkDebugReportCallbackCreateInfoEXT dinfo = { + .sType = VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT, + .flags = VK_DEBUG_REPORT_INFORMATION_BIT_EXT | + VK_DEBUG_REPORT_WARNING_BIT_EXT | + VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT | + VK_DEBUG_REPORT_ERROR_BIT_EXT | + VK_DEBUG_REPORT_DEBUG_BIT_EXT, + .pfnCallback = vk_dbg_callback, + .pUserData = vk, + }; + + // Since this is not part of the core spec, we need to load it. This + // can't fail because we've already successfully created an instance + // with this extension enabled. 
+ VK_LOAD_PFN(vkCreateDebugReportCallbackEXT) + pfn_vkCreateDebugReportCallbackEXT(vk->inst, &dinfo, MPVK_ALLOCATOR, + &vk->dbg); + } + + return true; +} + +#define MPVK_MAX_DEVICES 16 + +static bool physd_supports_surface(struct mpvk_ctx *vk, VkPhysicalDevice physd) +{ + uint32_t qfnum; + vkGetPhysicalDeviceQueueFamilyProperties(physd, &qfnum, NULL); + + for (int i = 0; i < qfnum; i++) { + VkBool32 sup; + VK(vkGetPhysicalDeviceSurfaceSupportKHR(physd, i, vk->surf, &sup)); + if (sup) + return true; + } + +error: + return false; +} + +bool mpvk_find_phys_device(struct mpvk_ctx *vk, const char *name, bool sw) +{ + assert(vk->surf); + + MP_VERBOSE(vk, "Probing for vulkan devices..\n"); + + VkPhysicalDevice *devices = NULL; + uint32_t num = 0; + VK(vkEnumeratePhysicalDevices(vk->inst, &num, NULL)); + devices = talloc_array(NULL, VkPhysicalDevice, num); + VK(vkEnumeratePhysicalDevices(vk->inst, &num, devices)); + + // Sorted by "priority". Reuses some m_opt code for convenience + static const struct m_opt_choice_alternatives types[] = { + {"discrete", VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU}, + {"integrated", VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU}, + {"virtual", VK_PHYSICAL_DEVICE_TYPE_VIRTUAL_GPU}, + {"software", VK_PHYSICAL_DEVICE_TYPE_CPU}, + {"unknown", VK_PHYSICAL_DEVICE_TYPE_OTHER}, + {0} + }; + + VkPhysicalDeviceProperties props[MPVK_MAX_DEVICES]; + for (int i = 0; i < num; i++) { + vkGetPhysicalDeviceProperties(devices[i], &props[i]); + MP_VERBOSE(vk, "GPU %d: %s (%s)\n", i, props[i].deviceName, + m_opt_choice_str(types, props[i].deviceType)); + } + + // Iterate through each type in order of decreasing preference + for (int t = 0; types[t].name; t++) { + // Disallow SW rendering unless explicitly enabled + if (types[t].value == VK_PHYSICAL_DEVICE_TYPE_CPU && !sw) + continue; + + for (int i = 0; i < num; i++) { + VkPhysicalDeviceProperties prop = props[i]; + if (prop.deviceType != types[t].value) + continue; + if (name && strcmp(name, prop.deviceName) != 0) + 
continue; + if (!physd_supports_surface(vk, devices[i])) + continue; + + MP_VERBOSE(vk, "Found device:\n"); + MP_VERBOSE(vk, " Device Name: %s\n", prop.deviceName); + MP_VERBOSE(vk, " Device ID: %x:%x\n", prop.vendorID, prop.deviceID); + MP_VERBOSE(vk, " Driver version: %d\n", prop.driverVersion); + MP_VERBOSE(vk, " API version: %d.%d.%d\n", + VK_VERSION_MAJOR(prop.apiVersion), + VK_VERSION_MINOR(prop.apiVersion), + VK_VERSION_PATCH(prop.apiVersion)); + vk->physd = devices[i]; + vk->limits = prop.limits; + talloc_free(devices); + return true; + } + } + +error: + MP_VERBOSE(vk, "Found no suitable device, giving up.\n"); + talloc_free(devices); + return false; +} + +bool mpvk_pick_surface_format(struct mpvk_ctx *vk) +{ + assert(vk->physd); + + VkSurfaceFormatKHR *formats = NULL; + int num; + + // Enumerate through the surface formats and find one that we can map to + // a ra_format + VK(vkGetPhysicalDeviceSurfaceFormatsKHR(vk->physd, vk->surf, &num, NULL)); + formats = talloc_array(NULL, VkSurfaceFormatKHR, num); + VK(vkGetPhysicalDeviceSurfaceFormatsKHR(vk->physd, vk->surf, &num, formats)); + + for (int i = 0; i < num; i++) { + // A value of VK_FORMAT_UNDEFINED means we can pick anything we want + if (formats[i].format == VK_FORMAT_UNDEFINED) { + vk->surf_format = (VkSurfaceFormatKHR) { + .colorSpace = VK_COLOR_SPACE_SRGB_NONLINEAR_KHR, + .format = VK_FORMAT_R8G8B8A8_UNORM, + }; + break; + } + + if (formats[i].colorSpace != VK_COLOR_SPACE_SRGB_NONLINEAR_KHR) + continue; + + vk->surf_format = formats[i]; + break; + } + + talloc_free(formats); + + if (!vk->surf_format.format) + goto error; + + return true; + +error: + MP_ERR(vk, "Failed picking surface format!\n"); + talloc_free(formats); + return false; +} + +bool mpvk_surface_init(struct vo *vo, struct mpvk_ctx *vk) +{ + assert(vk->inst); + VkResult res; + +#if HAVE_VULKAN_XLIB + if (!vo_x11_init(vo)) + goto xlib_uninit; + + if (!vo_x11_create_vo_window(vo, NULL, "mpvk")) + goto xlib_uninit; + + 
VkXlibSurfaceCreateInfoKHR xinfo = { + .sType = VK_STRUCTURE_TYPE_XLIB_SURFACE_CREATE_INFO_KHR, + .dpy = vo->x11->display, + .window = vo->x11->window, + }; + + res = vkCreateXlibSurfaceKHR(vk->inst, &xinfo, MPVK_ALLOCATOR, &vk->surf); + if (res != VK_SUCCESS) { + MP_VERBOSE(vo, "Failed creating Xlib surface: %s\n", vk_err(res)); + goto xlib_uninit; + } + + MP_VERBOSE(vo, "Using Xlib surface.\n"); + return true; + +xlib_uninit: + vo_x11_uninit(vo); +#endif + + // If we're reached this point, then none of the above surface probes + // were successful + MP_ERR(vo, "Failed creating any useful vulkan surface!\n"); + return false; +} + +bool mpvk_device_init(struct mpvk_ctx *vk) +{ + assert(vk->physd); + + VkQueueFamilyProperties *qfs = NULL; + int qfnum; + + // Enumerate the queue families and find suitable families for each task + vkGetPhysicalDeviceQueueFamilyProperties(vk->physd, &qfnum, NULL); + qfs = talloc_array(NULL, VkQueueFamilyProperties, qfnum); + vkGetPhysicalDeviceQueueFamilyProperties(vk->physd, &qfnum, qfs); + + MP_VERBOSE(vk, "Queue families supported by device:\n"); + + for (int i = 0; i < qfnum; i++) { + MP_VERBOSE(vk, "QF %d: flags 0x%x num %d\n", i, qfs[i].queueFlags, + qfs[i].queueCount); + } + + // Since using multiple queue families is devilishly difficult, we just + // pick a single queue family and stick with it. So in the interest of this, + // it's best to pick the one that supports the most features. + + int idx = -1; + for (int i = 0; i < qfnum; i++) { + if (!(qfs[i].queueFlags & VK_QUEUE_GRAPHICS_BIT)) + continue; + + // QF supports more features + if (idx < 0 || qfs[i].queueFlags > qfs[idx].queueFlags) + idx = i; + + // QF supports more queues (at the same specialization level) + if (qfs[i].queueFlags == qfs[idx].queueFlags && + qfs[i].queueCount > qfs[idx].queueCount) + { + idx = i; + } + } + + // Vulkan requires at least one GRAPHICS queue, so if this fails something + // is horribly wrong. 
+ assert(idx >= 0); + + // Now that we know which queue family we want, we can create the logical + // device + static const float priorities[MPVK_MAX_QUEUES] = {0}; + VkDeviceQueueCreateInfo qinfo = { + .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO, + .queueFamilyIndex = idx, + .queueCount = MPMIN(qfs[idx].queueCount, MPVK_MAX_QUEUES), + .pQueuePriorities = priorities, + }; + + static const char *exts[] = { + VK_KHR_SWAPCHAIN_EXTENSION_NAME, + VK_NV_GLSL_SHADER_EXTENSION_NAME, + }; + + VkDeviceCreateInfo dinfo = { + .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, + .queueCreateInfoCount = 1, + .pQueueCreateInfos = &qinfo, + .ppEnabledExtensionNames = exts, + .enabledExtensionCount = MP_ARRAY_SIZE(exts), + }; + + MP_VERBOSE(vk, "Creating vulkan device...\n"); + VK(vkCreateDevice(vk->physd, &dinfo, MPVK_ALLOCATOR, &vk->dev)); + + vk_malloc_init(vk); + + // Create the vk_cmdpool and all required queues / synchronization objects + struct vk_cmdpool *pool = vk->pool = talloc_zero(NULL, struct vk_cmdpool); + *pool = (struct vk_cmdpool) { + .qf = qinfo.queueFamilyIndex, + .props = qfs[qinfo.queueFamilyIndex], + .qcount = qinfo.queueCount, + }; + + talloc_free(qfs); + + for (int n = 0; n < pool->qcount; n++) + vkGetDeviceQueue(vk->dev, pool->qf, n, &pool->queues[n]); + + VkCommandPoolCreateInfo cinfo = { + .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, + .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | + VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, + .queueFamilyIndex = pool->qf, + }; + + VK(vkCreateCommandPool(vk->dev, &cinfo, MPVK_ALLOCATOR, &pool->pool)); + + VkCommandBufferAllocateInfo ainfo = { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, + .commandPool = pool->pool, + .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY, + .commandBufferCount = MPVK_MAX_CMDS, + }; + + VkCommandBuffer cmdbufs[MPVK_MAX_CMDS]; + VK(vkAllocateCommandBuffers(vk->dev, &ainfo, cmdbufs)); + + for (int n = 0; n < MPVK_MAX_CMDS; n++) { + struct vk_cmd *cmd = &pool->cmds[n]; + 
cmd->pool = pool; + cmd->buf = cmdbufs[n]; + + VkFenceCreateInfo finfo = { + .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO, + .flags = VK_FENCE_CREATE_SIGNALED_BIT, + }; + + VK(vkCreateFence(vk->dev, &finfo, MPVK_ALLOCATOR, &cmd->fence)); + + VkSemaphoreCreateInfo sinfo = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, + }; + + VK(vkCreateSemaphore(vk->dev, &sinfo, MPVK_ALLOCATOR, &cmd->done)); + } + + // Ensure we can actually present to the surface using this queue + VkBool32 sup; + VK(vkGetPhysicalDeviceSurfaceSupportKHR(vk->physd, pool->qf, vk->surf, &sup)); + if (!sup) { + MP_ERR(vk, "Queue family does not support surface presentation!\n"); + goto error; + } + + return true; + +error: + MP_ERR(vk, "Failed creating logical device!\n"); + talloc_free(qfs); + return false; +} + +static void run_callbacks(struct vk_cmd *cmd) +{ + for (int i = 0; i < cmd->num_callbacks; i++) { + struct vk_callback *cb = &cmd->callbacks[i]; + cb->run(cb->priv, cb->arg); + *cb = (struct vk_callback){0}; + } + + cmd->num_callbacks = 0; +} + +static void wait_for_cmds(struct mpvk_ctx *vk, struct vk_cmd cmds[], int num) +{ + if (!num) + return; + + VkFence fences[MPVK_MAX_CMDS]; + for (int i = 0; i < num; i++) + fences[i] = cmds[i].fence; + + vkWaitForFences(vk->dev, num, fences, true, UINT64_MAX); + + for (int i = 0; i < num; i++) + run_callbacks(&cmds[i]); +} + +void mpvk_wait_idle(struct mpvk_ctx *vk) +{ + struct vk_cmdpool *pool = vk->pool; + + int idx = pool->cindex, pidx = pool->cindex_pending; + if (pidx < idx) { // range doesn't wrap + wait_for_cmds(vk, &pool->cmds[pidx], idx - pidx); + } else if (pidx > idx) { // range wraps + wait_for_cmds(vk, &pool->cmds[pidx], MPVK_MAX_CMDS - pidx); + wait_for_cmds(vk, &pool->cmds[0], idx); + } + pool->cindex_pending = pool->cindex; +} + +void mpvk_poll_cmds(struct mpvk_ctx *vk, struct vk_cmdpool *pool, + uint64_t timeout) +{ + // If requested, hard block until at least one command completes + if (timeout > 0 && pool->cindex_pending != 
pool->cindex) { + vkWaitForFences(vk->dev, 1, &pool->cmds[pool->cindex_pending].fence, + true, timeout); + } + + // Lazily garbage collect the commands based on their status + while (pool->cindex_pending != pool->cindex) { + struct vk_cmd *cmd = &pool->cmds[pool->cindex_pending]; + VkResult res = vkGetFenceStatus(vk->dev, cmd->fence); + if (res != VK_SUCCESS) + break; + run_callbacks(cmd); + pool->cindex_pending++; + pool->cindex_pending %= MPVK_MAX_CMDS; + } +} + +void vk_dev_callback(struct mpvk_ctx *vk, vk_cb callback, void *p, void *arg) +{ + struct vk_cmdpool *pool = vk->pool; + if (pool->cindex_pending == pool->cindex) { + // The device was already idle, so we can just immediately call it + callback(p, arg); + return; + } + + int prev_idx = pool->cindex - 1; + if (prev_idx < 0) + prev_idx += MPVK_MAX_CMDS; + + struct vk_cmd *last_cmd = &pool->cmds[prev_idx]; + vk_cmd_callback(last_cmd, callback, p, arg); +} + +const VkImageSubresourceRange vk_range = { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .levelCount = 1, + .layerCount = 1, +}; + +const VkImageSubresourceLayers vk_layers = { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .layerCount = 1, +}; + +void vk_cmd_callback(struct vk_cmd *cmd, vk_cb callback, void *p, void *arg) +{ + MP_TARRAY_GROW(NULL, cmd->callbacks, cmd->num_callbacks); + cmd->callbacks[cmd->num_callbacks++] = (struct vk_callback) { + .run = callback, + .priv = p, + .arg = arg, + }; +} + +void vk_cmd_dep(struct vk_cmd *cmd, VkSemaphore dep, + VkPipelineStageFlagBits depstage) +{ + assert(cmd->num_deps < MPVK_MAX_CMD_DEPS); + cmd->deps[cmd->num_deps] = dep; + cmd->depstages[cmd->num_deps++] = depstage; +} + +struct vk_cmd *vk_cmd_begin(struct mpvk_ctx *vk, struct vk_cmdpool *pool) +{ + // Garbage collect the cmdpool first + mpvk_poll_cmds(vk, pool, 0); + + int next = (pool->cindex + 1) % MPVK_MAX_CMDS; + if (next == pool->cindex_pending) { + MP_ERR(vk, "No free command buffers!\n"); + goto error; + } + + struct vk_cmd *cmd = 
&pool->cmds[pool->cindex]; + pool->cindex = next; + + VK(vkResetCommandBuffer(cmd->buf, 0)); + + VkCommandBufferBeginInfo binfo = { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, + .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT, + }; + + VK(vkBeginCommandBuffer(cmd->buf, &binfo)); + + return cmd; + +error: + return NULL; +} + +bool vk_cmd_submit(struct mpvk_ctx *vk, struct vk_cmd *cmd, VkSemaphore *done) +{ + VK(vkEndCommandBuffer(cmd->buf)); + + struct vk_cmdpool *pool = cmd->pool; + VkQueue queue = pool->queues[pool->qindex++]; + pool->qindex %= pool->qcount; + + VkSubmitInfo sinfo = { + .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO, + .commandBufferCount = 1, + .pCommandBuffers = &cmd->buf, + .waitSemaphoreCount = cmd->num_deps, + .pWaitSemaphores = cmd->deps, + .pWaitDstStageMask = cmd->depstages, + }; + + if (done) { + sinfo.signalSemaphoreCount = 1; + sinfo.pSignalSemaphores = &cmd->done; + *done = cmd->done; + } + + VK(vkResetFences(vk->dev, 1, &cmd->fence)); + VK(vkQueueSubmit(queue, 1, &sinfo, cmd->fence)); + MP_TRACE(vk, "Submitted command on queue %p\n", (void *)queue); + + for (int i = 0; i < cmd->num_deps; i++) + cmd->deps[i] = NULL; + cmd->num_deps = 0; + + return true; + +error: + return false; +} + +static bool vk_swchain_update_info(struct vk_swchain *chain, + VkSwapchainCreateInfoKHR *info) +{ + struct mpvk_ctx *vk = chain->vk; + + // Query the supported capabilities and update this struct as needed + VkSurfaceCapabilitiesKHR caps; + VK(vkGetPhysicalDeviceSurfaceCapabilitiesKHR(vk->physd, vk->surf, &caps)); + + // Sorted by preference + static const VkCompositeAlphaFlagBitsKHR alphaModes[] = { + VK_COMPOSITE_ALPHA_POST_MULTIPLIED_BIT_KHR, + VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR, + }; + + for (int i = 0; i < MP_ARRAY_SIZE(alphaModes); i++) { + if (caps.supportedCompositeAlpha & alphaModes[i]) { + info->compositeAlpha = alphaModes[i]; + break; + } + } + + if (!info->compositeAlpha) { + MP_ERR(vk, "Failed picking alpha compositing mode (caps: 
%d)\n", + caps.supportedCompositeAlpha); + goto error; + } + + static const VkSurfaceTransformFlagBitsKHR rotModes[] = { + VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR, + VK_SURFACE_TRANSFORM_INHERIT_BIT_KHR, + }; + + for (int i = 0; i < MP_ARRAY_SIZE(rotModes); i++) { + if (caps.supportedTransforms & rotModes[i]) { + info->preTransform = rotModes[i]; + break; + } + } + + if (!info->preTransform) { + MP_ERR(vk, "Failed picking surface transform mode (caps: %d)\n", + caps.supportedTransforms); + goto error; + } + + // Image count as required + info->minImageCount = MPMAX(info->minImageCount, caps.minImageCount); + if (caps.maxImageCount) + info->minImageCount = MPMIN(info->minImageCount, caps.maxImageCount); + + // Check the extent against the allowed parameters + if (caps.currentExtent.width != info->imageExtent.width && + caps.currentExtent.width != 0xFFFFFFFF) + { + MP_WARN(vk, "Requested width %d does not match current width %d\n", + info->imageExtent.width, caps.currentExtent.width); + info->imageExtent.width = caps.currentExtent.width; + } + + if (caps.currentExtent.height != info->imageExtent.height && + caps.currentExtent.height != 0xFFFFFFFF) + { + MP_WARN(vk, "Requested height %d does not match current height %d\n", + info->imageExtent.height, caps.currentExtent.height); + info->imageExtent.height = caps.currentExtent.height; + } + + if (caps.minImageExtent.width > info->imageExtent.width || + caps.minImageExtent.height > info->imageExtent.height) + { + MP_ERR(vk, "Requested size %dx%d smaller than device minimum %dx%d\n", + info->imageExtent.width, info->imageExtent.height, + caps.minImageExtent.width, caps.minImageExtent.height); + goto error; + } + + if (caps.maxImageExtent.width < info->imageExtent.width || + caps.maxImageExtent.height < info->imageExtent.height) + { + MP_ERR(vk, "Requested size %dx%d larger than device maximum %dx%d\n", + info->imageExtent.width, info->imageExtent.height, + caps.maxImageExtent.width, caps.maxImageExtent.height); + goto error; 
+ } + + // We just request whatever usage we can, and let the ra_vk decide what + // ra_tex_params that translates to. This makes the images as flexible + // as possible. + info->imageUsage = caps.supportedUsageFlags; + return true; + +error: + return false; +} + +bool vk_swchain_init(struct mpvk_ctx *vk, struct ra *ra, int size, + struct vk_swchain *chain) +{ + assert(vk->dev); + assert(vk->surf_format.format); + + struct VkSwapchainCreateInfoKHR dummy = { + .sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR, + .surface = vk->surf, + .minImageCount = size, + .imageFormat = vk->surf_format.format, + .imageColorSpace = vk->surf_format.colorSpace, + .imageArrayLayers = 1, // non-stereoscopic + .imageSharingMode = VK_SHARING_MODE_EXCLUSIVE, + .presentMode = VK_PRESENT_MODE_FIFO_KHR, + .clipped = true, + }; + + *chain = (struct vk_swchain) { + .vk = vk, + .ra = ra, + .protoInfo = dummy, + }; + + return true; +} + +void vk_swchain_uninit(struct ra *ra, struct vk_swchain *chain) +{ + struct mpvk_ctx *vk = chain->vk; + if (!vk) + return; + + // Note: We technically don't even need the struct *ra, it's just there + // to "force" the correct uninitialization order at the API level. Either + // way, make sure the RA actually matches.. 
+ assert(ra == chain->ra); + + mpvk_wait_idle(vk); + + for (int i = 0; i < chain->num_images; i++) + ra_tex_free(ra, &chain->images[i]); + for (int i = 0; i < chain->num_acquired; i++) + vkDestroySemaphore(vk->dev, chain->acquired[i], MPVK_ALLOCATOR); + + vkDestroySwapchainKHR(vk->dev, chain->swchain, MPVK_ALLOCATOR); + + talloc_free(chain->images); + talloc_free(chain->acquired); + *chain = (struct vk_swchain){0}; +} + +static void destroy_swapchain(struct mpvk_ctx *vk, VkSwapchainKHR swchain) +{ + vkDestroySwapchainKHR(vk->dev, swchain, MPVK_ALLOCATOR); +} + +bool vk_swchain_resize(struct vk_swchain *chain, int w, int h) +{ + if (w == chain->w && h == chain->h) + return true; + + struct mpvk_ctx *vk = chain->vk; + VkImage *vkimages = NULL; + bool ret = false; + + VkSwapchainCreateInfoKHR sinfo = chain->protoInfo; + sinfo.imageExtent = (VkExtent2D){ w, h }; + sinfo.oldSwapchain = chain->swchain; + + if (!vk_swchain_update_info(chain, &sinfo)) + goto error; + + VK(vkCreateSwapchainKHR(vk->dev, &sinfo, MPVK_ALLOCATOR, &chain->swchain)); + chain->w = w; + chain->h = h; + + // Freeing the old swapchain while it's still in use is an error, so do + // it asynchronously once the device is idle. 
+ if (sinfo.oldSwapchain) + vk_dev_callback(vk, (vk_cb) destroy_swapchain, vk, sinfo.oldSwapchain); + + // Get the new swapchain images + uint32_t num; + VK(vkGetSwapchainImagesKHR(vk->dev, chain->swchain, &num, NULL)); + vkimages = talloc_array(NULL, VkImage, num); + VK(vkGetSwapchainImagesKHR(vk->dev, chain->swchain, &num, vkimages)); + + // If needed, allocate some more semaphores + while (num > chain->num_acquired) { + VkSemaphore sem; + static const VkSemaphoreCreateInfo seminfo = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, + }; + VK(vkCreateSemaphore(vk->dev, &seminfo, MPVK_ALLOCATOR, &sem)); + MP_TARRAY_APPEND(NULL, chain->acquired, chain->num_acquired, sem); + } + + // Recreate the ra_tex wrappers + for (int i = 0; i < chain->num_images; i++) + ra_tex_free(chain->ra, &chain->images[i]); + + chain->num_images = num; + MP_TARRAY_GROW(NULL, chain->images, chain->num_images); + for (int i = 0; i < num; i++) { + chain->images[i] = ra_vk_wrap_swchain_img(chain->ra, vkimages[i], sinfo); + if (!chain->images[i]) + goto error; + } + + ret = true; + +error: + talloc_free(vkimages); + return ret; +} + +bool vk_swchain_get(struct vk_swchain *chain, struct vk_swimg *out) +{ + struct mpvk_ctx *vk = chain->vk; + + int semidx = chain->idx_acquired++; + chain->idx_acquired %= chain->num_acquired; + + uint32_t imgidx = 0; + VK(vkAcquireNextImageKHR(vk->dev, chain->swchain, UINT64_MAX, + chain->acquired[semidx], NULL, &imgidx)); + + *out = (struct vk_swimg) { + .chain = chain, + .index = imgidx, + .image = chain->images[imgidx], + .acquired = chain->acquired[semidx], + }; + return true; + +error: + return false; +} diff --git a/video/out/vulkan/utils.h b/video/out/vulkan/utils.h new file mode 100644 index 0000000000000..6273ebca95ef4 --- /dev/null +++ b/video/out/vulkan/utils.h @@ -0,0 +1,178 @@ +#pragma once + +#include "video/out/vo.h" +#include "video/mp_image.h" + +#include "common.h" +#include "formats.h" + +#define VK_LOAD_PFN(name) PFN_##name pfn_##name = 
(PFN_##name) \ + vkGetInstanceProcAddr(vk->inst, #name); + +// Return a human-readable name for a VkResult error code +const char* vk_err(VkResult res); + +// Convenience macros to simplify a lot of common boilerplate +#define VK_ASSERT(res, str) \ + if (res != VK_SUCCESS) { \ + MP_ERR(vk, str ": %s\n", vk_err(res)); \ + goto error; \ + } + +#define VK(cmd) \ + { \ + MP_TRACE(vk, #cmd "\n"); \ + VkResult res ## __LINE__ = (cmd); \ + VK_ASSERT(res ## __LINE__, #cmd); \ + } + +// Uninits everything in the correct order +void mpvk_uninit(struct mpvk_ctx *vk); + +// Initialization functions: As a rule of thumb, these need to be called in +// this order, followed by vk_malloc_init, followed by RA initialization, and +// finally followed by vk_swchain initialization. + +// Create a vulkan instance. Returns false on failure +bool mpvk_instance_init(struct mpvk_ctx *vk, bool validate); + +// Generate a VkSurfaceKHR usable for video output. Returns false on +// failure. Must be called after mpvk_instance_init. +bool mpvk_surface_init(struct vo *vo, struct mpvk_ctx *vk); + +// Find a suitable physical device for use with rendering and which supports +// the surface. +// name: only match a device with this name +// sw: also allow software/virtual devices +bool mpvk_find_phys_device(struct mpvk_ctx *vk, const char *name, bool sw); + +// Pick a suitable surface format that's supported by this physical device. +bool mpvk_pick_surface_format(struct mpvk_ctx *vk); + +// Create a logical device and initialize the vk_cmdpools +bool mpvk_device_init(struct mpvk_ctx *vk); + +// Wait until all commands submitted to all queues have completed +void mpvk_wait_idle(struct mpvk_ctx *vk); + +// Wait until at least one command submitted to any queue has completed, and +// process the callbacks. Good for event loops that need to delay until a +// command completes. Will block at most `timeout` nanoseconds. 
If used with +// 0, it only garbage collects completed commands without blocking. +void mpvk_poll_cmds(struct mpvk_ctx *vk, struct vk_cmdpool *pool, + uint64_t timeout); + +// Predefined structs for a simple non-layered, non-mipped image +extern const VkImageSubresourceRange vk_range; +extern const VkImageSubresourceLayers vk_layers; + +// Since lots of vulkan operations need to be done lazily once the affected +// resources are no longer in use, provide an abstraction for tracking these. +// In practice, these are only checked and run when submitting new commands, so +// the actual execution may be delayed by a frame. +typedef void (*vk_cb)(void *priv, void *arg); + +struct vk_callback { + vk_cb run; + void *priv; + void *arg; // as a convenience, you also get to pass an arg for "free" +}; + +// Associate a callback with the completion of all currently pending commands. +// This will essentially run once the device is completely idle. +void vk_dev_callback(struct mpvk_ctx *vk, vk_cb callback, void *p, void *arg); + +#define MPVK_MAX_CMD_DEPS 8 + +// Helper wrapper around command buffers that also track dependencies, +// callbacks and synchronization primitives +struct vk_cmd { + struct vk_cmdpool *pool; // pool it was allocated from + VkCommandBuffer buf; + VkFence fence; // the fence guards cmd buffer reuse + VkSemaphore done; // the semaphore signals when execution is done + // The semaphores represent dependencies that need to complete before + // this command can be executed. These are *not* owned by the vk_cmd + VkSemaphore deps[MPVK_MAX_CMD_DEPS]; + VkPipelineStageFlags depstages[MPVK_MAX_CMD_DEPS]; + int num_deps; + // Since VkFences are useless, we have to manually track "callbacks" + // to fire once the VkFence completes. These are used for multiple purposes, + // ranging from garbage collection (resource deallocation) to fencing. 
+ struct vk_callback *callbacks; + int num_callbacks; +}; + +// Associate a callback with the completion of the current command. The +// callback will be fired once the command completes, or shortly thereafter. +void vk_cmd_callback(struct vk_cmd *cmd, vk_cb callback, void *p, void *arg); + +// Associate a dependency for the current command. This semaphore must signal +// by the corresponding stage before the command may execute. +void vk_cmd_dep(struct vk_cmd *cmd, VkSemaphore dep, + VkPipelineStageFlagBits depstage); + +#define MPVK_MAX_QUEUES 8 +#define MPVK_MAX_CMDS 16 + +// Command pool / queue family hybrid abstraction +struct vk_cmdpool { + VkQueueFamilyProperties props; + uint32_t qf; // queue family index + VkCommandPool pool; + VkQueue queues[MPVK_MAX_QUEUES]; + int qcount; + int qindex; + // Command buffers associated with this queue. (No, VkCommandPool is not + // a pool of command buffers), so you still have to pool them manually. We + // also have to keep track of "in flight" (pending) command buffers separately + // to work around vkQueueWaitIdle being completely fucking useless when + // using a queue for presentation. + struct vk_cmd cmds[MPVK_MAX_CMDS]; + int cindex; + int cindex_pending; +}; + +// Fetch the next command buffer from a command pool and begin recording to it. +// Returns NULL on failure. +struct vk_cmd *vk_cmd_begin(struct mpvk_ctx *vk, struct vk_cmdpool *pool); + +// Finish the currently recording command buffer and submit it for execution. +// If `done` is not NULL, it will be set to a semaphore that will signal once +// the command completes. (And MUST have a corresponding semaphore wait) +// Returns whether successful. 
+bool vk_cmd_submit(struct mpvk_ctx *vk, struct vk_cmd *cmd, VkSemaphore *done); + +// Swapchain +struct vk_swchain { + struct mpvk_ctx *vk; + struct ra *ra; + int w, h; // current size + VkSwapchainCreateInfoKHR protoInfo; // partially filled-in prototype + VkSwapchainKHR swchain; + // state of the images: + struct ra_tex **images; // ra_tex wrappers for the vkimages + int num_images; // size of images + VkSemaphore *acquired; // pool of semaphores used to synchronize images + int num_acquired; // size of this pool + int idx_acquired; // index of next free semaphore within this pool +}; + +// depth: desired number of swapchain images +bool vk_swchain_init(struct mpvk_ctx *vk, struct ra *ra, int depth, + struct vk_swchain *chain); +void vk_swchain_uninit(struct ra *ra, struct vk_swchain *chain); +bool vk_swchain_resize(struct vk_swchain *chain, int w, int h); + +// Swapchain image +struct vk_swimg { + struct vk_swchain *chain; // vk_swchain it was allocated from + int index; // index within that vk_swchain + struct ra_tex *image; // ra_tex wrapper for this image + VkSemaphore acquired; // will be signalled once the image is ready +}; + +// Get the next vk_swimg. This may block if all swapchain images are still in +// use, but normally the user should allocate a larger swapchain than what they +// actually use. +bool vk_swchain_get(struct vk_swchain *chain, struct vk_swimg *out); diff --git a/wscript b/wscript index 9d885884d7476..42e5e726650da 100644 --- a/wscript +++ b/wscript @@ -780,6 +780,16 @@ video_output_features = [ 'fmsg': "No OpenGL video output found or enabled. " + "Aborting. If you really mean to compile without OpenGL " + "video outputs use --disable-gl." 
+ }, { + 'name': '--vulkan-xlib', + 'desc': 'Vulkan Xlib backend', + 'func': check_true, + 'deps': ['x11'], + }, { + 'name': '--vulkan', + 'desc': 'Vulkan video output', + 'deps_any': [ 'vulkan-xlib' ], + 'func': check_cc(header_name='vulkan/vulkan.h', lib='vulkan'), }, { 'name': 'egl-helpers', 'desc': 'EGL helper functions', diff --git a/wscript_build.py b/wscript_build.py index 3c5c00dc6415e..878b1faf02513 100644 --- a/wscript_build.py +++ b/wscript_build.py @@ -437,12 +437,17 @@ def build(ctx): ( "video/out/vo_tct.c" ), ( "video/out/vo_vaapi.c", "vaapi-x11" ), ( "video/out/vo_vdpau.c", "vdpau" ), + ( "video/out/vo_vulkan.c", "vulkan" ), ( "video/out/vo_wayland.c", "wayland" ), ( "video/out/vo_x11.c" , "x11" ), ( "video/out/vo_xv.c", "xv" ), ( "video/out/w32_common.c", "win32-desktop" ), ( "video/out/win32/displayconfig.c", "win32-desktop" ), ( "video/out/win32/droptarget.c", "win32-desktop" ), + ( "video/out/vulkan/utils.c", "vulkan" ), + ( "video/out/vulkan/malloc.c", "vulkan" ), + ( "video/out/vulkan/formats.c", "vulkan" ), + ( "video/out/vulkan/ra_vk.c", "vulkan" ), ( "video/out/win32/exclusive_hack.c", "gl-win32" ), ( "video/out/wayland_common.c", "wayland" ), ( "video/out/wayland/buffer.c", "wayland" ),