From c2d769aa7c5e925e2790a7a92223f542c01fba6e Mon Sep 17 00:00:00 2001 From: Niklas Haas Date: Wed, 14 Sep 2016 20:54:18 +0200 Subject: [PATCH] vo_vulkan: initial implementation This time based on RA. 2017 is the year of the vulkan desktop! Current problems / limitations / improvement opportunities: 1. The entire thing depends on VK_NV_glsl_shader, which is a god-awful nvidia-exclusive hack that barely works and is held together with duct tape and prayers. Long-term, we really, REALLY need to figure out a way to use a GLSL->SPIR-V middleware like glslang. The problem with glslang in particular is that it's a gigantic pile of awful, but maybe time will help here.. 2. We don't use async transfer at all. This is very difficult, but doable in theory with the newer design. Would require refactoring vk_cmdpool slightly, and also expanding ra_vk.active_cmd to include commands on the async queue as well. Also, async compute is pretty much impossible to benefit from because we need to pingpong with serial dependencies anyway. (Sorry AMD users, you fell for the async compute meme) 3. The custom memory allocator is pretty naive. It's prone to under-allocating memory, allocation thrashing, freeing slabs too aggressively, and general slowness due to allocating from the same thread. In addition to making it smarter, we should also make it multi-threaded: ideally it would free slabs from a different thread, and also pre-allocate slabs from a different thread if it reaches some critical "low" threshold on the amount of available bytes. (Perhaps relative to the current heap size). These limitations manifest themselves as occasional choppy performance when changing the window size. 4. The swapchain code and ANGLE's swapchain code could share common options somehow. Left away for now because I don't want to deal with that headache for the time being. 5. 
The swapchain/flipping code violates the vulkan spec, by assuming that the presentation queue will be bounded (in cases where rendering is significantly faster than vsync). But apparently, there's simply no better way to do this right now, to the point where even the stupid cube.c examples from LunarG etc. do it wrong. (cf. https://github.com/KhronosGroup/Vulkan-Docs/issues/370) --- video/out/opengl/ra.h | 9 +- video/out/opengl/utils.c | 3 +- video/out/vo.c | 4 + video/out/vo_vulkan.c | 335 ++++++++ video/out/vulkan/common.h | 50 ++ video/out/vulkan/formats.c | 55 ++ video/out/vulkan/formats.h | 16 + video/out/vulkan/malloc.c | 315 +++++++ video/out/vulkan/malloc.h | 35 + video/out/vulkan/ra_vk.c | 1588 ++++++++++++++++++++++++++++++++++++ video/out/vulkan/ra_vk.h | 25 + video/out/vulkan/utils.c | 936 +++++++++++++++++++++ video/out/vulkan/utils.h | 178 ++++ wscript | 10 + wscript_build.py | 5 + 15 files changed, 3559 insertions(+), 5 deletions(-) create mode 100644 video/out/vo_vulkan.c create mode 100644 video/out/vulkan/common.h create mode 100644 video/out/vulkan/formats.c create mode 100644 video/out/vulkan/formats.h create mode 100644 video/out/vulkan/malloc.c create mode 100644 video/out/vulkan/malloc.h create mode 100644 video/out/vulkan/ra_vk.c create mode 100644 video/out/vulkan/ra_vk.h create mode 100644 video/out/vulkan/utils.c create mode 100644 video/out/vulkan/utils.h diff --git a/video/out/opengl/ra.h b/video/out/opengl/ra.h index ae7fb9aea730a..1f716d98f8bdc 100644 --- a/video/out/opengl/ra.h +++ b/video/out/opengl/ra.h @@ -146,6 +146,7 @@ enum ra_buf_type { RA_BUF_TYPE_TEX_UPLOAD, // texture upload buffer (pixel buffer object) RA_BUF_TYPE_SHADER_STORAGE, // shader buffer (SSBO), for RA_VARTYPE_BUF_RW RA_BUF_TYPE_UNIFORM, // uniform buffer (UBO), for RA_VARTYPE_BUF_RO + RA_BUF_TYPE_VERTEX, // not publicly usable (RA-internal usage) }; struct ra_buf_params { @@ -369,10 +370,10 @@ struct ra_fns { void (*buf_destroy)(struct ra *ra, struct 
ra_buf *buf); - // Update the contents of a buffer, starting at a given offset and up to a - // given size, with the contents of *data. This is an extremely common - // operation. Calling this while the buffer is considered "in use" is an - // error. (See: buf_poll) + // Update the contents of a buffer, starting at a given offset (*must* be a + // multiple of 4) and up to a given size, with the contents of *data. This + // is an extremely common operation. Calling this while the buffer is + // considered "in use" is an error. (See: buf_poll) void (*buf_update)(struct ra *ra, struct ra_buf *buf, ptrdiff_t offset, const void *data, size_t size); diff --git a/video/out/opengl/utils.c b/video/out/opengl/utils.c index b8fc24a52e133..aeadd346b9408 100644 --- a/video/out/opengl/utils.c +++ b/video/out/opengl/utils.c @@ -64,7 +64,8 @@ static bool ra_buf_pool_grow(struct ra *ra, struct ra_buf_pool *pool) return false; MP_TARRAY_INSERT_AT(NULL, pool->buffers, pool->num_buffers, pool->index, buf); - MP_VERBOSE(ra, "Resized buffer pool to size %d\n", pool->num_buffers); + MP_VERBOSE(ra, "Resized buffer pool of type %u to size %d\n", + pool->current_params.type, pool->num_buffers); return true; } diff --git a/video/out/vo.c b/video/out/vo.c index f9c5d04e24be0..06507c7f87694 100644 --- a/video/out/vo.c +++ b/video/out/vo.c @@ -60,6 +60,7 @@ extern const struct vo_driver video_out_drm; extern const struct vo_driver video_out_direct3d; extern const struct vo_driver video_out_sdl; extern const struct vo_driver video_out_vaapi; +extern const struct vo_driver video_out_vulkan; extern const struct vo_driver video_out_wayland; extern const struct vo_driver video_out_rpi; extern const struct vo_driver video_out_tct; @@ -78,6 +79,9 @@ const struct vo_driver *const video_out_drivers[] = #if HAVE_DIRECT3D &video_out_direct3d, #endif +#if HAVE_VULKAN + &video_out_vulkan, +#endif #if HAVE_WAYLAND &video_out_wayland, #endif diff --git a/video/out/vo_vulkan.c b/video/out/vo_vulkan.c new file 
mode 100644 index 0000000000000..9e6c7984c6a3d --- /dev/null +++ b/video/out/vo_vulkan.c @@ -0,0 +1,335 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . + */ + +#include "mpv_talloc.h" +#include "options/m_config.h" +#include "osdep/timer.h" +#include "video/mp_image.h" +#include "video/out/x11_common.h" +#include "vo.h" +#include "sub/osd.h" + +#include "opengl/ra.h" +#include "opengl/video.h" + +#include "vulkan/common.h" +#include "vulkan/utils.h" +#include "vulkan/ra_vk.h" + +struct vo_vulkan_opts { + int debug; // whether to load the validation layers or not + int allow_sw; // whether to allow software devices + char *device; // force a specific GPU + int swsize; // swapchain size + int swdepth; // swapchain depth +}; + +struct vk_priv { + struct vo *vo; + struct mp_log *log; + + struct vo_vulkan_opts opts; + + struct mpvk_ctx vk; + struct ra *ra; + struct gl_video *renderer; + + struct vk_swchain swchain; + int frames_in_flight; +}; + +static bool resize(struct vo *vo) +{ + struct vk_priv *p = vo->priv; + + MP_VERBOSE(vo, "Resize: %dx%d\n", vo->dwidth, vo->dheight); + + if (!vk_swchain_resize(&p->swchain, vo->dwidth, vo->dheight)) { + MP_ERR(vo, "Failed resizing swapchain!\n"); + return false; + } + + struct mp_rect src, dst; + struct mp_osd_res osd; + vo_get_src_dst_rects(vo, &src, &dst, &osd); + + gl_video_resize(p->renderer, &src, &dst, &osd); + + 
vo->want_redraw = true; + return true; +} + +static int reconfig(struct vo *vo, struct mp_image_params *params) +{ + struct vk_priv *p = vo->priv; + + if (vo->x11) + vo_x11_config_vo_window(vo); + + if (!resize(vo)) + return VO_ERROR; + + gl_video_config(p->renderer, params); + + return 0; +} + +static void uninit(struct vo *vo) +{ + struct vk_priv *p = vo->priv; + struct mpvk_ctx *vk = &p->vk; + + gl_video_uninit(p->renderer); + + if (p->ra) { + vk_swchain_uninit(p->ra, &p->swchain); + p->ra->fns->destroy(p->ra); + } + + // Clean up platform-specific windowing stuff. Do this first to prevent + // keeping around the window for long, then we can uninit the device etc. + // afterwards + if (vo->x11) + vo_x11_uninit(vo); + + mpvk_uninit(vk); +} + +static int preinit(struct vo *vo) +{ + struct vk_priv *p = vo->priv; + struct mpvk_ctx *vk = &p->vk; + p->vo = vo; + p->log = vk->log = vo->log; + + if (!mpvk_instance_init(vk, p->opts.debug)) + goto error; + if (!mpvk_surface_init(vo, vk)) + goto error; + if (!mpvk_find_phys_device(vk, p->opts.device, p->opts.allow_sw)) + goto error; + if (!mpvk_pick_surface_format(vk)) + goto error; + if (!mpvk_device_init(vk)) + goto error; + p->ra = ra_create_vk(vk, p->log); + if (!p->ra) + goto error; + if (!vk_swchain_init(vk, p->ra, p->opts.swsize, &p->swchain)) + goto error; + + p->renderer = gl_video_init(p->ra, vo->log, vo->global); + gl_video_set_osd_source(p->renderer, vo->osd); + gl_video_configure_queue(p->renderer, vo); + + return 0; + +error: + uninit(vo); + return -1; +} + +static int control(struct vo *vo, uint32_t request, void *data) +{ + struct vk_priv *p = vo->priv; + + switch (request) { + case VOCTRL_SET_PANSCAN: + return resize(vo) ? 
VO_TRUE : VO_ERROR; + case VOCTRL_SET_EQUALIZER: + vo->want_redraw = true; + return VO_TRUE; + case VOCTRL_UPDATE_RENDER_OPTS: { + gl_video_update_options(p->renderer); + gl_video_configure_queue(p->renderer, p->vo); + p->vo->want_redraw = true; + return true; + } + case VOCTRL_RESET: + gl_video_reset(p->renderer); + return true; + case VOCTRL_PAUSE: + if (gl_video_showing_interpolated_frame(p->renderer)) + vo->want_redraw = true; + return true; + case VOCTRL_PERFORMANCE_DATA: + gl_video_perfdata(p->renderer, (struct voctrl_performance_data *)data); + return true; + } + + int events = 0, r = 0; + + if (vo->x11) + r |= vo_x11_control(vo, &events, request, data); + + if (events & VO_EVENT_RESIZE) + r |= resize(vo) ? 0 : VO_ERROR; + + if (events & VO_EVENT_EXPOSE) + vo->want_redraw = true; + + vo_event(vo, events); + return r; +} + +static void draw_frame(struct vo *vo, struct vo_frame *frame) +{ + struct vk_priv *p = vo->priv; + struct vk_swimg swimg; + if (!vk_swchain_get(&p->swchain, &swimg)) + goto error; + + struct fbodst target = { + .tex = swimg.image, + .flip = false, + }; + + gl_video_render_frame(p->renderer, frame, target); + if (!ra_vk_present_frame(p->ra, &swimg, &p->frames_in_flight)) { + MP_ERR(vo, "Failed presenting frame!\n"); + goto error; + } + +error: + return; +} + +static void flip_page(struct vo *vo) +{ + struct vk_priv *p = vo->priv; + while (p->frames_in_flight >= p->opts.swdepth) + mpvk_poll_cmds(&p->vk, p->vk.pool, UINT64_MAX); +} + +static int query_format(struct vo *vo, int format) +{ + struct vk_priv *p = vo->priv; + if (!gl_video_check_format(p->renderer, format)) + return 0; + return 1; +} + +static void wakeup(struct vo *vo) +{ + if (vo->x11) + vo_x11_wakeup(vo); +} + +static void wait_events(struct vo *vo, int64_t until_time_us) +{ + if (vo->x11) { + vo_x11_wait_events(vo, until_time_us); + } else { + vo_wait_default(vo, until_time_us); + } +} + +static struct mp_image *get_image(struct vo *vo, int imgfmt, int w, int h, + int 
stride_align) +{ + struct vk_priv *p = vo->priv; + return gl_video_get_image(p->renderer, imgfmt, w, h, stride_align); +} + +static int vk_validate_dev(struct mp_log *log, const struct m_option *opt, + struct bstr name, struct bstr param) +{ + int ret = M_OPT_INVALID; + VkResult res; + + // Create a dummy instance to validate/list the devices + VkInstanceCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, + }; + + VkInstance inst; + VkPhysicalDevice *devices = NULL; + uint32_t num = 0; + + res = vkCreateInstance(&info, MPVK_ALLOCATOR, &inst); + if (res != VK_SUCCESS) + goto error; + + res = vkEnumeratePhysicalDevices(inst, &num, NULL); + if (res != VK_SUCCESS) + goto error; + + devices = talloc_array(NULL, VkPhysicalDevice, num); + vkEnumeratePhysicalDevices(inst, &num, devices); + if (res != VK_SUCCESS) + goto error; + + bool help = bstr_equals0(param, "help"); + if (help) { + mp_info(log, "Available vulkan devices:\n"); + ret = M_OPT_EXIT; + } + + for (int i = 0; i < num; i++) { + VkPhysicalDeviceProperties prop; + vkGetPhysicalDeviceProperties(devices[i], &prop); + + if (help) { + mp_info(log, " '%s' (GPU %d, ID %x:%x)\n", prop.deviceName, i, + prop.vendorID, prop.deviceID); + } else if (bstr_equals0(param, prop.deviceName)) { + ret = 0; + break; + } + } + + if (!help) + mp_err(log, "No device with name '%.*s'!\n", BSTR_P(param)); + +error: + talloc_free(devices); + return ret; +} + +#define OPT_BASE_STRUCT struct vk_priv + +const struct vo_driver video_out_vulkan = { + .description = "Vulkan Renderer", + .name = "vulkan", + .preinit = preinit, + .query_format = query_format, + .reconfig = reconfig, + .control = control, + .get_image = get_image, + .draw_frame = draw_frame, + .flip_page = flip_page, + .wait_events = wait_events, + .wakeup = wakeup, + .uninit = uninit, + .priv_size = sizeof(struct vk_priv), + .options = (const m_option_t[]) { + OPT_FLAG("vulkan-debug", opts.debug, 0), + OPT_FLAG("vulkan-sw", opts.allow_sw, 0), + 
OPT_STRING_VALIDATE("vulkan-device", opts.device, 0, vk_validate_dev), + OPT_INTRANGE("vulkan-swapchain-size", opts.swsize, 0, 1, + MPVK_MAX_STREAMING_DEPTH), + OPT_INTRANGE("vulkan-swapchain-depth", opts.swdepth, 0, 1, + MPVK_MAX_STREAMING_DEPTH), + {0} + }, + .priv_defaults = &(const struct vk_priv) { + .opts = { + .swsize = 8, + .swdepth = 1, + }, + }, +}; diff --git a/video/out/vulkan/common.h b/video/out/vulkan/common.h new file mode 100644 index 0000000000000..9113d27a6a201 --- /dev/null +++ b/video/out/vulkan/common.h @@ -0,0 +1,50 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "config.h" + +#include "common/common.h" +#include "common/msg.h" + +// We need to define all platforms we want to support. Since we have +// our own mechanism for checking this, we re-define the right symbols +#if HAVE_VULKAN_XLIB +#define VK_USE_PLATFORM_XLIB_KHR +#endif + +#include + +// Vulkan allows the optional use of a custom allocator. We don't need one but +// mark this parameter with a better name in case we ever decide to change this +// in the future. (And to make the code more readable) +#define MPVK_ALLOCATOR NULL + +// A lot of things depend on streaming resources across frames. Depending on +// how many frames we render ahead of time, we need to pick enough to avoid +// any conflicts, so make all of these tunable relative to this constant in +// order to centralize them. 
+#define MPVK_MAX_STREAMING_DEPTH 8 + +// Shared struct used to hold vulkan context information +struct mpvk_ctx { + struct mp_log *log; + VkInstance inst; + VkPhysicalDevice physd; + VkDebugReportCallbackEXT dbg; + VkDevice dev; + + // Surface, must be initialized fter the context itself + VkSurfaceKHR surf; + VkSurfaceFormatKHR surf_format; // picked at surface initialization time + + struct vk_malloc *alloc; // memory allocator for this device + struct vk_cmdpool *pool; // command pool for this device + + // Cached capabilities + VkPhysicalDeviceLimits limits; +}; diff --git a/video/out/vulkan/formats.c b/video/out/vulkan/formats.c new file mode 100644 index 0000000000000..b44bead99cc80 --- /dev/null +++ b/video/out/vulkan/formats.c @@ -0,0 +1,55 @@ +#include "formats.h" + +const struct vk_format vk_formats[] = { + // Regular, byte-aligned integer formats + {"r8", VK_FORMAT_R8_UNORM, 1, 1, {8 }, RA_CTYPE_UNORM }, + {"rg8", VK_FORMAT_R8G8_UNORM, 2, 2, {8, 8 }, RA_CTYPE_UNORM }, + {"rgb8", VK_FORMAT_R8G8B8_UNORM, 3, 3, {8, 8, 8 }, RA_CTYPE_UNORM }, + {"rgba8", VK_FORMAT_R8G8B8A8_UNORM, 4, 4, {8, 8, 8, 8 }, RA_CTYPE_UNORM }, + {"r16", VK_FORMAT_R16_UNORM, 1, 2, {16 }, RA_CTYPE_UNORM }, + {"rg16", VK_FORMAT_R16G16_UNORM, 2, 4, {16, 16 }, RA_CTYPE_UNORM }, + {"rgb16", VK_FORMAT_R16G16B16_UNORM, 3, 6, {16, 16, 16 }, RA_CTYPE_UNORM }, + {"rgba16", VK_FORMAT_R16G16B16A16_UNORM, 4, 8, {16, 16, 16, 16}, RA_CTYPE_UNORM }, + + // Special, integer-only formats + {"r32ui", VK_FORMAT_R32_UINT, 1, 4, {32 }, RA_CTYPE_UINT }, + {"rg32ui", VK_FORMAT_R32G32_UINT, 2, 8, {32, 32 }, RA_CTYPE_UINT }, + {"rgb32ui", VK_FORMAT_R32G32B32_UINT, 3, 12, {32, 32, 32 }, RA_CTYPE_UINT }, + {"rgba32ui", VK_FORMAT_R32G32B32A32_UINT, 4, 16, {32, 32, 32, 32}, RA_CTYPE_UINT }, + {"r64ui", VK_FORMAT_R64_UINT, 1, 8, {64 }, RA_CTYPE_UINT }, + {"rg64ui", VK_FORMAT_R64G64_UINT, 2, 16, {64, 64 }, RA_CTYPE_UINT }, + {"rgb64ui", VK_FORMAT_R64G64B64_UINT, 3, 24, {64, 64, 64 }, RA_CTYPE_UINT }, + {"rgba64ui", 
VK_FORMAT_R64G64B64A64_UINT, 4, 32, {64, 64, 64, 64}, RA_CTYPE_UINT }, + + // Packed integer formats + {"rg4", VK_FORMAT_R4G4_UNORM_PACK8, 2, 1, {4, 4 }, RA_CTYPE_UNORM }, + {"rgba4", VK_FORMAT_R4G4B4A4_UNORM_PACK16, 4, 2, {4, 4, 4, 4 }, RA_CTYPE_UNORM }, + {"rgb565", VK_FORMAT_R5G6B5_UNORM_PACK16, 3, 2, {5, 6, 5 }, RA_CTYPE_UNORM }, + {"rgb565a1", VK_FORMAT_R5G5B5A1_UNORM_PACK16, 4, 2, {5, 5, 5, 1 }, RA_CTYPE_UNORM }, + + // Float formats (native formats, hf = half float, df = double float) + {"r16hf", VK_FORMAT_R16_SFLOAT, 1, 2, {16 }, RA_CTYPE_FLOAT }, + {"rg16hf", VK_FORMAT_R16G16_SFLOAT, 2, 4, {16, 16 }, RA_CTYPE_FLOAT }, + {"rgb16hf", VK_FORMAT_R16G16B16_SFLOAT, 3, 6, {16, 16, 16 }, RA_CTYPE_FLOAT }, + {"rgba16hf", VK_FORMAT_R16G16B16A16_SFLOAT, 4, 8, {16, 16, 16, 16}, RA_CTYPE_FLOAT }, + {"r32f", VK_FORMAT_R32_SFLOAT, 1, 4, {32 }, RA_CTYPE_FLOAT }, + {"rg32f", VK_FORMAT_R32G32_SFLOAT, 2, 8, {32, 32 }, RA_CTYPE_FLOAT }, + {"rgb32f", VK_FORMAT_R32G32B32_SFLOAT, 3, 12, {32, 32, 32 }, RA_CTYPE_FLOAT }, + {"rgba32f", VK_FORMAT_R32G32B32A32_SFLOAT, 4, 16, {32, 32, 32, 32}, RA_CTYPE_FLOAT }, + {"r64df", VK_FORMAT_R64_SFLOAT, 1, 8, {64 }, RA_CTYPE_FLOAT }, + {"rg64df", VK_FORMAT_R64G64_SFLOAT, 2, 16, {64, 64 }, RA_CTYPE_FLOAT }, + {"rgb64df", VK_FORMAT_R64G64B64_SFLOAT, 3, 24, {64, 64, 64 }, RA_CTYPE_FLOAT }, + {"rgba64df", VK_FORMAT_R64G64B64A64_SFLOAT, 4, 32, {64, 64, 64, 64}, RA_CTYPE_FLOAT }, + + // "Swapped" component order images + {"bgr8", VK_FORMAT_B8G8R8_UNORM, 3, 3, {8, 8, 8 }, RA_CTYPE_UNORM, true }, + {"bgra8", VK_FORMAT_B8G8R8A8_UNORM, 4, 4, {8, 8, 8, 8 }, RA_CTYPE_UNORM, true }, + {"bgra4", VK_FORMAT_B4G4R4A4_UNORM_PACK16, 4, 2, {4, 4, 4, 4 }, RA_CTYPE_UNORM, true }, + {"bgr565", VK_FORMAT_B5G6R5_UNORM_PACK16, 3, 2, {5, 6, 5 }, RA_CTYPE_UNORM, true }, + {"bgr565a1", VK_FORMAT_B5G5R5A1_UNORM_PACK16, 4, 2, {5, 5, 5, 1 }, RA_CTYPE_UNORM, true }, + {"a1rgb5", VK_FORMAT_A1R5G5B5_UNORM_PACK16, 4, 2, {1, 5, 5, 5 }, RA_CTYPE_UNORM, true }, + {"a2rgb10", 
VK_FORMAT_A2R10G10B10_UNORM_PACK32, 4, 4, {2, 10, 10, 10}, RA_CTYPE_UNORM, true }, + {"a2bgr10", VK_FORMAT_A2B10G10R10_UNORM_PACK32, 4, 4, {2, 10, 10, 10}, RA_CTYPE_UNORM, true }, + {"abgr8", VK_FORMAT_A8B8G8R8_UNORM_PACK32, 4, 4, {8, 8, 8, 8 }, RA_CTYPE_UNORM, true }, + {0} +}; diff --git a/video/out/vulkan/formats.h b/video/out/vulkan/formats.h new file mode 100644 index 0000000000000..e57275a153a12 --- /dev/null +++ b/video/out/vulkan/formats.h @@ -0,0 +1,16 @@ +#pragma once + +#include "video/out/opengl/ra.h" +#include "common.h" + +struct vk_format { + const char *name; + VkFormat iformat; // vulkan format enum + int components; // how many components are there + int bytes; // how many bytes is a texel + int bits[4]; // how many bits per component + enum ra_ctype ctype; // format representation type + bool fucked_order; // used for formats which are not simply rgba +}; + +extern const struct vk_format vk_formats[]; diff --git a/video/out/vulkan/malloc.c b/video/out/vulkan/malloc.c new file mode 100644 index 0000000000000..cdab6eb590e63 --- /dev/null +++ b/video/out/vulkan/malloc.c @@ -0,0 +1,315 @@ +#include "malloc.h" +#include "utils.h" +#include "osdep/timer.h" + +// Controls how much more space we will allocate than actually necessary. +// Increasing this number increases the amount of memory used in total, but +// decreases the frequency at which slabs need to be allocated and freed. A +// value of 4 means the slabs will be allocated 4 times as large as they need +// to be. +#define MPVK_HEAP_SLAB_OVERCOMMIT 4 + +// Controls the minimum slab size, to avoid overusing small slabs when +// allocating many small slabs. (Default: 1 MB) +#define MPVK_HEAP_MINIMUM_SLAB_SIZE (1 << 20) + +// A single slab represents a contiguous region of allocated memory. Actual +// allocations are served as slices of this. Slabs are organized into linked +// lists, which represent individual heaps. 
+struct vk_slab { + struct vk_slab *next; // pointer to next vk_slab, or NULL + VkDeviceMemory mem; // underlying device allocation + VkDeviceSize size; // total size of `slab` + VkDeviceSize used; // number of bytes actually in use (for GC accounting) + VkDeviceSize index; // next free byte in `slab` + // optional, depends on the memory type: + VkBuffer buffer; // buffer spanning the entire slab + void *data; // mapped memory corresponding to `mem` +}; + +struct vk_heap { + VkBufferUsageFlagBits usage; // or 0 for generic heaps + struct vk_slab *tip; // linked list of slabs that form this heap +}; + +// Represents a single memory type. All allocations of this memory type are +// grouped together into heaps; one per buffer usage type and one for generic +// allocations (e.g. images). +struct vk_memtype { + int index; // the memory type index + int heapIndex; // the memory heap index + VkMemoryPropertyFlagBits flags; // the memory type bits + struct vk_heap generic_heap; // the heap for generic allocations + // An array of heaps for each possible buffer type (grows dynamically): + // This is an array of sub-allocations, so we can resize the buf_heaps + // array without breaking the vk_heap pointers in memslice.priv. + struct vk_heap **buf_heaps; + int num_buf_heaps; +}; + +// The overall state of the allocator, which keeps track of a vk_heap for each +// memory type supported by the device. 
+struct vk_malloc { + struct vk_memtype types[VK_MAX_MEMORY_TYPES]; + int num_types; +}; + +void vk_malloc_init(struct mpvk_ctx *vk) +{ + assert(vk->physd); + + struct vk_malloc *ma = vk->alloc = talloc_zero(NULL, struct vk_malloc); + + VkPhysicalDeviceMemoryProperties prop; + vkGetPhysicalDeviceMemoryProperties(vk->physd, &prop); + + ma->num_types = prop.memoryTypeCount; + for (int i = 0; i < prop.memoryTypeCount; i++) { + ma->types[i] = (struct vk_memtype) { + .index = i, + .heapIndex = prop.memoryTypes[i].heapIndex, + .flags = prop.memoryTypes[i].propertyFlags, + }; + } +} + +// "Unlinks" a slab. The slab_ptr is updated to the next link in the chain, +// or NULL if none left. +static void slab_free(struct mpvk_ctx *vk, struct vk_slab **slab_ptr) +{ + struct vk_slab *slab = *slab_ptr; + if (!slab) + return; + + assert(slab->used == 0); + + int64_t start = mp_time_us(); + vkDestroyBuffer(vk->dev, slab->buffer, MPVK_ALLOCATOR); + // also implicitly unmaps the memory if needed + vkFreeMemory(vk->dev, slab->mem, MPVK_ALLOCATOR); + int64_t stop = mp_time_us(); + + MP_VERBOSE(vk, "Freeing slab of size %lu took %ld μs.\n", + slab->size, stop - start); + + *slab_ptr = slab->next; + talloc_free(slab); +} + +static void heap_uninit(struct mpvk_ctx *vk, struct vk_heap *heap) +{ + while (heap->tip) + slab_free(vk, &heap->tip); +} + +void vk_malloc_uninit(struct mpvk_ctx *vk) +{ + struct vk_malloc *ma = vk->alloc; + if (!ma) + return; + + for (int i = 0; i < ma->num_types; i++) { + heap_uninit(vk, &ma->types[i].generic_heap); + for (int b = 0; b < ma->types[i].num_buf_heaps; b++) { + heap_uninit(vk, ma->types[i].buf_heaps[b]); + talloc_free(ma->types[i].buf_heaps[b]); + } + talloc_free(ma->types[i].buf_heaps); + } + + talloc_free(vk->alloc); +} + +// reqs: optional +static struct vk_memtype *find_best_memtype(struct mpvk_ctx *vk, + VkMemoryPropertyFlagBits flags, + VkMemoryRequirements *reqs) +{ + struct vk_malloc *ma = vk->alloc; + + // The vulkan spec requires memory types 
to be sorted in the "optimal" + // order, so the first matching type we find will be the best/fastest one. + for (int i = 0; i < ma->num_types; i++) { + // The memory type flags must include our properties + if ((ma->types[i].flags & flags) != flags) + continue; + // The memory type must be supported by the requirements (bitfield) + if (reqs && !(reqs->memoryTypeBits & (1 << i))) + continue; + + return &ma->types[i]; + } + + MP_ERR(vk, "Found no memory type matching property flags 0x%x!\n", flags); + return NULL; +} + +// Resizes a heap to make sure we have enough free bytes to serve an allocation +static bool resize_heap(struct mpvk_ctx *vk, struct vk_memtype *type, + struct vk_heap *heap, VkDeviceSize size, + VkDeviceSize align) +{ + // If the tip already exists and is large enough, we can return right away + if (heap->tip) { + if (MP_ALIGN_UP(heap->tip->index, align) + size <= heap->tip->size) + return true; + + // If the tip exists but is not large enough and has no other current + // allocations, free it right away to avoid accumulating garbage. + if (heap->tip->used == 0) + slab_free(vk, &heap->tip); + } + + // Otherwise, allocate a new vk_slab and prepend it to the linked list + struct vk_slab *slab = talloc_ptrtype(NULL, slab); + + VkDeviceSize minSize = MPMAX(MPVK_HEAP_MINIMUM_SLAB_SIZE, + MPVK_HEAP_SLAB_OVERCOMMIT * size); + *slab = (struct vk_slab) { + .next = heap->tip, + .size = heap->tip ? 
MPMAX(heap->tip->size, minSize) : minSize, + }; + + MP_VERBOSE(vk, "Allocating %lu memory of type 0x%x (id %d) in heap %d.\n", + slab->size, type->flags, type->index, type->heapIndex); + + VkMemoryAllocateInfo minfo = { + .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, + .memoryTypeIndex = type->index, + .allocationSize = slab->size, + }; + + if (heap->usage) { + VkBufferCreateInfo binfo = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .size = slab->size, + .usage = heap->usage, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + }; + + VK(vkCreateBuffer(vk->dev, &binfo, MPVK_ALLOCATOR, &slab->buffer)); + + VkMemoryRequirements reqs; + vkGetBufferMemoryRequirements(vk->dev, slab->buffer, &reqs); + minfo.allocationSize = reqs.size; // this can be larger than slab->size + + // Sanity check the memory requirements to make sure we didn't screw up + if (!(reqs.memoryTypeBits & (1 << type->index))) { + MP_ERR(vk, "Chosen memory type %d does not support buffer usage " + "0x%x!\n", type->index, heap->usage); + goto error; + } + } + + VK(vkAllocateMemory(vk->dev, &minfo, MPVK_ALLOCATOR, &slab->mem)); + + if (type->flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) + VK(vkMapMemory(vk->dev, slab->mem, 0, VK_WHOLE_SIZE, 0, &slab->data)); + + if (heap->usage) + VK(vkBindBufferMemory(vk->dev, slab->buffer, slab->mem, 0)); + + heap->tip = slab; + return true; + +error: + slab_free(vk, &slab); + return false; +} + +void vk_free_memslice(struct mpvk_ctx *vk, struct vk_memslice slice) +{ + struct vk_heap *heap = slice.priv; + + // Find the slab containing this allocation, while also keeping track + // of the pointer to it (so we can unlink it from the list if needed) + struct vk_slab **slab_ptr = &heap->tip; + struct vk_slab *slab = *slab_ptr; + while (slab) { + if (slab->mem == slice.vkmem) + break; + slab_ptr = &slab->next; + slab = *slab_ptr; + } + + assert(slab); + assert(slab->used >= slice.size); + slab->used -= slice.size; + + MP_DBG(vk, "Freeing slice %lu + %lu from slab with 
size %lu\n", + slice.offset, slice.size, slab->size); + + if (slab->used == 0 && slab != heap->tip) + slab_free(vk, slab_ptr); +} + +static bool slice_heap(struct mpvk_ctx *vk, struct vk_memtype *type, + struct vk_heap *heap, VkDeviceSize size, + VkDeviceSize alignment, struct vk_memslice *out) +{ + if (!resize_heap(vk, type, heap, size, alignment)) + return false; + + struct vk_slab *tip = heap->tip; + assert(tip); + *out = (struct vk_memslice) { + .vkmem = tip->mem, + .offset = MP_ALIGN_UP(tip->index, alignment), + .size = size, + .priv = heap, + }; + + MP_DBG(vk, "Sub-allocating slice %lu + %lu from slab with size %lu\n", + out->offset, out->size, tip->size); + + tip->index = out->offset + size; + tip->used += size; + return true; +} + +bool vk_malloc_generic(struct mpvk_ctx *vk, VkMemoryRequirements reqs, + VkMemoryPropertyFlagBits flags, struct vk_memslice *out) +{ + struct vk_memtype *type = find_best_memtype(vk, flags, &reqs); + if (!type) + return false; + + struct vk_heap *heap = &type->generic_heap; + return slice_heap(vk, type, heap, reqs.size, reqs.alignment, out); +} + +bool vk_malloc_buffer(struct mpvk_ctx *vk, VkBufferUsageFlagBits bufFlags, + VkMemoryPropertyFlagBits memFlags, VkDeviceSize size, + VkDeviceSize alignment, struct vk_bufslice *out) +{ + struct vk_memtype *type = find_best_memtype(vk, memFlags, NULL); + if (!type) + return false; + + struct vk_heap *heap = NULL; + for (int i = 0; i < type->num_buf_heaps; i++) { + if (type->buf_heaps[i]->usage == bufFlags) { + heap = type->buf_heaps[i]; + goto found; + } + } + + // no buffer heap with this type => add it + MP_TARRAY_GROW(NULL, type->buf_heaps, type->num_buf_heaps + 1); + heap = type->buf_heaps[type->num_buf_heaps++] = talloc_ptrtype(NULL, heap); + + *heap = (struct vk_heap) { + .usage = bufFlags, + }; + +found: + if (!slice_heap(vk, type, heap, size, alignment, &out->mem)) + return false; + + struct vk_slab *tip = heap->tip; + out->buf = tip->buffer; + if (tip->data) + out->data = (void 
*)((uintptr_t)tip->data + (ptrdiff_t)out->mem.offset); + + return true; +} diff --git a/video/out/vulkan/malloc.h b/video/out/vulkan/malloc.h new file mode 100644 index 0000000000000..1963950d54f3a --- /dev/null +++ b/video/out/vulkan/malloc.h @@ -0,0 +1,35 @@ +#pragma once + +#include "common.h" + +void vk_malloc_init(struct mpvk_ctx *vk); +void vk_malloc_uninit(struct mpvk_ctx *vk); + +// Represents a single "slice" of generic (non-buffer) memory, plus some +// metadata for accounting. This struct is essentially read-only. +struct vk_memslice { + VkDeviceMemory vkmem; + VkDeviceSize offset; + VkDeviceSize size; + void *priv; +}; + +void vk_free_memslice(struct mpvk_ctx *vk, struct vk_memslice slice); +bool vk_malloc_generic(struct mpvk_ctx *vk, VkMemoryRequirements reqs, + VkMemoryPropertyFlagBits flags, struct vk_memslice *out); + +// Represents a single "slice" of a larger buffer +struct vk_bufslice { + struct vk_memslice mem; // must be freed by the user when done + VkBuffer buf; // the buffer this memory was sliced from + // For persistently mapped buffers, this points to the first usable byte of + // this slice. + void *data; +}; + +// Allocate a buffer slice. This is more efficient than vk_malloc_generic for +// when the user needs lots of buffers, since it doesn't require +// creating/destroying lots of (little) VkBuffers. 
+bool vk_malloc_buffer(struct mpvk_ctx *vk, VkBufferUsageFlagBits bufFlags, + VkMemoryPropertyFlagBits memFlags, VkDeviceSize size, + VkDeviceSize alignment, struct vk_bufslice *out); diff --git a/video/out/vulkan/ra_vk.c b/video/out/vulkan/ra_vk.c new file mode 100644 index 0000000000000..853d868a861d6 --- /dev/null +++ b/video/out/vulkan/ra_vk.c @@ -0,0 +1,1588 @@ +#include "ra_vk.h" +#include "malloc.h" +#include "video/out/opengl/utils.h" + +// For ra.priv +struct ra_vk { + struct mpvk_ctx *vk; + struct ra_tex *clear_tex; // stupid hack for clear() + // "Currently recording" command buffer + struct vk_cmd *active_cmd; +}; + +static struct mpvk_ctx *vk_get(struct ra *ra) +{ + struct ra_vk *p = ra->priv; + return p->vk; +} + +static struct vk_cmd *vk_require_cmd(struct ra *ra) +{ + struct ra_vk *p = ra->priv; + struct mpvk_ctx *vk = vk_get(ra); + struct vk_cmdpool *pool = vk->pool; + + if (p->active_cmd) { + assert(p->active_cmd->pool == pool); + return p->active_cmd; + } + + struct vk_cmd *cmd = vk_cmd_begin(vk, pool); + return p->active_cmd = cmd; +} + +// Note: This technically follows the flush() API, but we don't need +// to expose that (and in fact, it's a bad idea) since we control flushing +// behavior with ra_vk_present_frame already. 
+static void vk_flush(struct ra *ra) +{ + struct ra_vk *p = ra->priv; + struct mpvk_ctx *vk = vk_get(ra); + + if (!p->active_cmd) + return; + + vk_cmd_submit(vk, p->active_cmd, NULL); + p->active_cmd = NULL; +} + +// the callback's *priv will always be set to `ra` +static void vk_callback(struct ra *ra, vk_cb callback, void *arg) +{ + struct ra_vk *p = ra->priv; + struct mpvk_ctx *vk = vk_get(ra); + + if (p->active_cmd) { + vk_cmd_callback(p->active_cmd, callback, ra, arg); + } else { + vk_dev_callback(vk, callback, ra, arg); + } +} + +#define MAKE_LAZY_DESTRUCTOR(fun, argtype) \ + static void fun##_lazy(struct ra *ra, argtype *arg) { \ + vk_callback(ra, (vk_cb) fun, arg); \ + } + +static void vk_destroy_ra(struct ra *ra) +{ + struct ra_vk *p = ra->priv; + struct mpvk_ctx *vk = vk_get(ra); + + vk_flush(ra); + mpvk_wait_idle(vk); + ra_tex_free(ra, &p->clear_tex); + + talloc_free(ra); +} + +static bool vk_setup_formats(struct ra *ra) +{ + struct mpvk_ctx *vk = vk_get(ra); + + for (const struct vk_format *vk_fmt = vk_formats; vk_fmt->name; vk_fmt++) { + VkFormatProperties prop; + vkGetPhysicalDeviceFormatProperties(vk->physd, vk_fmt->iformat, &prop); + + // As a bare minimum, we need to sample from an allocated image + VkFormatFeatureFlags flags = prop.optimalTilingFeatures; + if (!(flags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT)) + continue; + + VkFormatFeatureFlags linear_bits, render_bits; + linear_bits = VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT; + render_bits = VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT | + VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT; + + struct ra_format *fmt = talloc_zero(ra, struct ra_format); + *fmt = (struct ra_format) { + .name = vk_fmt->name, + .priv = (void *)vk_fmt, + .ctype = vk_fmt->ctype, + .ordered = !vk_fmt->fucked_order, + .num_components = vk_fmt->components, + .pixel_size = vk_fmt->bytes, + .linear_filter = !!(flags & linear_bits), + .renderable = !!(flags & render_bits), + }; + + for (int i = 0; i < 4; i++) + 
fmt->component_size[i] = fmt->component_depth[i] = vk_fmt->bits[i]; + + MP_TARRAY_APPEND(ra, ra->formats, ra->num_formats, fmt); + } + + // Populate some other capabilities related to formats while we're at it + VkImageType imgType[3] = { + VK_IMAGE_TYPE_1D, + VK_IMAGE_TYPE_2D, + VK_IMAGE_TYPE_3D + }; + + // R8_UNORM is supported on literally every single vulkan implementation + const VkFormat testfmt = VK_FORMAT_R8_UNORM; + + for (int d = 0; d < 3; d++) { + VkImageFormatProperties iprop; + VkResult res = vkGetPhysicalDeviceImageFormatProperties(vk->physd, + testfmt, imgType[d], VK_IMAGE_TILING_OPTIMAL, + VK_IMAGE_USAGE_SAMPLED_BIT, 0, &iprop); + + switch (imgType[d]) { + case VK_IMAGE_TYPE_1D: + if (res == VK_SUCCESS) + ra->caps |= RA_CAP_TEX_1D; + break; + case VK_IMAGE_TYPE_2D: + // 2D formats must be supported by RA, so ensure this is the case + VK_ASSERT(res, "Querying 2D format limits"); + ra->max_texture_wh = MPMIN(iprop.maxExtent.width, iprop.maxExtent.height); + break; + case VK_IMAGE_TYPE_3D: + if (res == VK_SUCCESS) + ra->caps |= RA_CAP_TEX_3D; + break; + } + } + + // RA_CAP_BLIT implies both blitting between images as well as blitting + // directly to the swapchain image, so check for all three operations + bool blittable = true; + VkFormatProperties prop; + vkGetPhysicalDeviceFormatProperties(vk->physd, testfmt, &prop); + if (!(prop.optimalTilingFeatures & VK_FORMAT_FEATURE_BLIT_SRC_BIT)) + blittable = false; + if (!(prop.optimalTilingFeatures & VK_FORMAT_FEATURE_BLIT_DST_BIT)) + blittable = false; + + vkGetPhysicalDeviceFormatProperties(vk->physd, vk->surf_format.format, &prop); + if (!(prop.optimalTilingFeatures & VK_FORMAT_FEATURE_BLIT_DST_BIT)) + blittable = false; + + if (blittable) + ra->caps |= RA_CAP_BLIT; + + return true; + +error: + return false; +} + +static struct ra_fns ra_fns_vk; + +struct ra *ra_create_vk(struct mpvk_ctx *vk, struct mp_log *log) +{ + assert(vk->dev); + assert(vk->alloc); + + struct ra *ra = talloc_zero(NULL, struct ra); 
+ ra->log = log; + ra->fns = &ra_fns_vk; + + struct ra_vk *p = ra->priv = talloc_zero(ra, struct ra_vk); + p->vk = vk; + + // There's no way to query the supported GLSL version from VK_NV_glsl_shader + // (thanks nvidia), so just pick the GL version that modern nvidia devices + // support.. + ra->glsl_version = 450; + ra->glsl_vulkan = true; + ra->max_shmem = vk->limits.maxComputeSharedMemorySize; + ra->caps = RA_CAP_NESTED_ARRAY; + + if (vk->pool->props.queueFlags & VK_QUEUE_COMPUTE_BIT) + ra->caps |= RA_CAP_COMPUTE; + + if (!vk_setup_formats(ra)) + goto error; + + // UBO support is required + ra->caps |= RA_CAP_BUF_RO; + + // Try creating a shader storage buffer + struct ra_buf_params ssbo_params = { + .type = RA_BUF_TYPE_SHADER_STORAGE, + .size = 16, + }; + + struct ra_buf *ssbo = ra_buf_create(ra, &ssbo_params); + if (ssbo) { + ra->caps |= RA_CAP_BUF_RW; + ra_buf_free(ra, &ssbo); + } + + // To support clear() by region, we need to allocate a dummy 1x1 image that + // will be used as the source of blit operations + struct ra_tex_params clear_params = { + .dimensions = 1, // no point in using a 2D image if height = 1 + .w = 1, + .h = 1, + .d = 1, + .format = ra_find_float16_format(ra, 4), + .blit_src = 1, + .host_mutable = 1, + }; + + p->clear_tex = ra_tex_create(ra, &clear_params); + if (!p->clear_tex) { + MP_ERR(ra, "Failed creating 1x1 dummy texture for clear()!\n"); + goto error; + } + + return ra; + +error: + vk_destroy_ra(ra); + return NULL; +} + +// Boilerplate wrapper around vkCreateRenderPass to ensure passes remain +// compatible +static VkResult vk_create_render_pass(VkDevice dev, const struct ra_format *fmt, + bool load_fbo, VkRenderPass *out) +{ + struct vk_format *vk_fmt = fmt->priv; + assert(fmt->renderable); + + VkRenderPassCreateInfo rinfo = { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = &(VkAttachmentDescription) { + .format = vk_fmt->iformat, + .samples = VK_SAMPLE_COUNT_1_BIT, + .loadOp = 
load_fbo ? VK_ATTACHMENT_LOAD_OP_LOAD + : VK_ATTACHMENT_LOAD_OP_DONT_CARE, + .storeOp = VK_ATTACHMENT_STORE_OP_STORE, + .initialLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + .finalLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + }, + .subpassCount = 1, + .pSubpasses = &(VkSubpassDescription) { + .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, + .colorAttachmentCount = 1, + .pColorAttachments = &(VkAttachmentReference) { + .attachment = 0, + .layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + }, + }, + }; + + return vkCreateRenderPass(dev, &rinfo, MPVK_ALLOCATOR, out); +} + +// For ra_tex.priv +struct ra_tex_vk { + bool external_img; + VkImageType type; + VkImage img; + struct vk_memslice mem; + // for sampling + VkImageView view; + VkSampler sampler; + // for rendering + VkFramebuffer framebuffer; + VkRenderPass dummyPass; + // for uploading + struct ra_buf_pool pbo; + // "current" metadata, can change during the course of execution + VkImageLayout current_layout; + VkPipelineStageFlagBits current_stage; + VkAccessFlagBits current_access; +}; + +// Small helper to ease image barrier creation. 
if `discard` is set, the contents +// of the image will be undefined after the barrier +static void tex_barrier(struct vk_cmd *cmd, struct ra_tex_vk *tex_vk, + VkPipelineStageFlagBits newStage, + VkAccessFlagBits newAccess, VkImageLayout newLayout, + bool discard) +{ + VkImageMemoryBarrier imgBarrier = { + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .oldLayout = tex_vk->current_layout, + .newLayout = newLayout, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .srcAccessMask = tex_vk->current_access, + .dstAccessMask = newAccess, + .image = tex_vk->img, + .subresourceRange = vk_range, + }; + + if (discard) { + imgBarrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; + imgBarrier.srcAccessMask = 0; + } + + vkCmdPipelineBarrier(cmd->buf, tex_vk->current_stage, newStage, 0, + 0, NULL, 0, NULL, 1, &imgBarrier); + + tex_vk->current_stage = newStage; + tex_vk->current_layout = newLayout; + tex_vk->current_access = newAccess; +} + +static void vk_tex_destroy(struct ra *ra, struct ra_tex *tex) +{ + if (!tex) + return; + + struct mpvk_ctx *vk = vk_get(ra); + struct ra_tex_vk *tex_vk = tex->priv; + + ra_buf_pool_uninit(ra, &tex_vk->pbo); + vkDestroyFramebuffer(vk->dev, tex_vk->framebuffer, MPVK_ALLOCATOR); + vkDestroyRenderPass(vk->dev, tex_vk->dummyPass, MPVK_ALLOCATOR); + vkDestroySampler(vk->dev, tex_vk->sampler, MPVK_ALLOCATOR); + vkDestroyImageView(vk->dev, tex_vk->view, MPVK_ALLOCATOR); + if (!tex_vk->external_img) { + vkDestroyImage(vk->dev, tex_vk->img, MPVK_ALLOCATOR); + vk_free_memslice(vk, tex_vk->mem); + } + + talloc_free(tex); +} + +MAKE_LAZY_DESTRUCTOR(vk_tex_destroy, struct ra_tex); + +// Initializes non-VkImage values like the image view, samplers, etc. 
+static bool vk_init_image(struct ra *ra, struct ra_tex *tex) +{ + struct mpvk_ctx *vk = vk_get(ra); + + struct ra_tex_params *params = &tex->params; + struct ra_tex_vk *tex_vk = tex->priv; + assert(tex_vk->img); + + tex_vk->current_layout = VK_IMAGE_LAYOUT_UNDEFINED; + tex_vk->current_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + tex_vk->current_access = 0; + + if (params->render_src || params->render_dst) { + static const VkImageViewType viewType[] = { + [VK_IMAGE_TYPE_1D] = VK_IMAGE_VIEW_TYPE_1D, + [VK_IMAGE_TYPE_2D] = VK_IMAGE_VIEW_TYPE_2D, + [VK_IMAGE_TYPE_3D] = VK_IMAGE_VIEW_TYPE_3D, + }; + + const struct vk_format *fmt = params->format->priv; + VkImageViewCreateInfo vinfo = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .image = tex_vk->img, + .viewType = viewType[tex_vk->type], + .format = fmt->iformat, + .subresourceRange = vk_range, + }; + + VK(vkCreateImageView(vk->dev, &vinfo, MPVK_ALLOCATOR, &tex_vk->view)); + } + + if (params->render_src) { + assert(params->format->linear_filter || !params->src_linear); + VkFilter filter = params->src_linear + ? VK_FILTER_LINEAR + : VK_FILTER_NEAREST; + VkSamplerAddressMode wrap = params->src_repeat + ? 
VK_SAMPLER_ADDRESS_MODE_REPEAT + : VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE; + VkSamplerCreateInfo sinfo = { + .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO, + .magFilter = filter, + .minFilter = filter, + .addressModeU = wrap, + .addressModeV = wrap, + .addressModeW = wrap, + .maxAnisotropy = 1.0, + }; + + VK(vkCreateSampler(vk->dev, &sinfo, MPVK_ALLOCATOR, &tex_vk->sampler)); + } + + if (params->render_dst) { + // Framebuffers need to be created against a specific render pass + // layout, so we need to temporarily create a skeleton/dummy render + // pass for vulkan to figure out the compatibility + VK(vk_create_render_pass(vk->dev, params->format, false, &tex_vk->dummyPass)); + + VkFramebufferCreateInfo finfo = { + .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO, + .renderPass = tex_vk->dummyPass, + .attachmentCount = 1, + .pAttachments = &tex_vk->view, + .width = tex->params.w, + .height = tex->params.h, + .layers = 1, + }; + + VK(vkCreateFramebuffer(vk->dev, &finfo, MPVK_ALLOCATOR, + &tex_vk->framebuffer)); + + // NOTE: Normally we would free the dummyPass again here, but a bug + // in the nvidia vulkan driver causes a segfault if you do. 
+ } + + return true; + +error: + return false; +} + +static struct ra_tex *vk_tex_create(struct ra *ra, + const struct ra_tex_params *params) +{ + struct mpvk_ctx *vk = vk_get(ra); + + struct ra_tex *tex = talloc_zero(NULL, struct ra_tex); + tex->params = *params; + tex->params.initial_data = NULL; + + struct ra_tex_vk *tex_vk = tex->priv = talloc_zero(tex, struct ra_tex_vk); + + const struct vk_format *fmt = params->format->priv; + switch (params->dimensions) { + case 1: tex_vk->type = VK_IMAGE_TYPE_1D; break; + case 2: tex_vk->type = VK_IMAGE_TYPE_2D; break; + case 3: tex_vk->type = VK_IMAGE_TYPE_3D; break; + default: abort(); + } + + VkImageUsageFlags usage = 0; + if (params->render_src) + usage |= VK_IMAGE_USAGE_SAMPLED_BIT; + if (params->render_dst) + usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; + if (params->storage_dst) + usage |= VK_IMAGE_USAGE_STORAGE_BIT; + if (params->blit_src) + usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT; + if (params->host_mutable || params->blit_dst || params->initial_data) + usage |= VK_IMAGE_USAGE_TRANSFER_DST_BIT; + + // Double-check image usage support and fail immediately if invalid + VkImageFormatProperties iprop; + VkResult res = vkGetPhysicalDeviceImageFormatProperties(vk->physd, + fmt->iformat, tex_vk->type, VK_IMAGE_TILING_OPTIMAL, usage, 0, + &iprop); + if (res == VK_ERROR_FORMAT_NOT_SUPPORTED) { + return NULL; + } else { + VK_ASSERT(res, "Querying image format properties"); + } + + VkFormatProperties prop; + vkGetPhysicalDeviceFormatProperties(vk->physd, fmt->iformat, &prop); + VkFormatFeatureFlags flags = prop.optimalTilingFeatures; + + bool has_blit_src = flags & VK_FORMAT_FEATURE_BLIT_SRC_BIT, + has_src_linear = flags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT; + + if (params->w > iprop.maxExtent.width || + params->h > iprop.maxExtent.height || + params->d > iprop.maxExtent.depth || + (params->blit_src && !has_blit_src) || + (params->src_linear && !has_src_linear)) + { + return NULL; + } + + VkImageCreateInfo 
iinfo = { + .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, + .imageType = tex_vk->type, + .format = fmt->iformat, + .extent = (VkExtent3D) { params->w, params->h, params->d }, + .mipLevels = 1, + .arrayLayers = 1, + .samples = VK_SAMPLE_COUNT_1_BIT, + .tiling = VK_IMAGE_TILING_OPTIMAL, + .usage = usage, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, + }; + + VK(vkCreateImage(vk->dev, &iinfo, MPVK_ALLOCATOR, &tex_vk->img)); + + VkMemoryPropertyFlagBits memFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + VkMemoryRequirements reqs; + vkGetImageMemoryRequirements(vk->dev, tex_vk->img, &reqs); + + struct vk_memslice *mem = &tex_vk->mem; + if (!vk_malloc_generic(vk, reqs, memFlags, mem)) + goto error; + + VK(vkBindImageMemory(vk->dev, tex_vk->img, mem->vkmem, mem->offset)); + + if (!vk_init_image(ra, tex)) + goto error; + + if (params->initial_data) { + struct ra_tex_upload_params ul_params = { + .tex = tex, + .invalidate = true, + .src = params->initial_data, + .stride = params->w * fmt->bytes, + }; + if (!ra->fns->tex_upload(ra, &ul_params)) + goto error; + } + + return tex; + +error: + vk_tex_destroy(ra, tex); + return NULL; +} + +struct ra_tex *ra_vk_wrap_swchain_img(struct ra *ra, VkImage vkimg, + VkSwapchainCreateInfoKHR info) +{ + struct mpvk_ctx *vk = vk_get(ra); + struct ra_tex *tex = NULL; + + const struct ra_format *format = NULL; + for (int i = 0; i < ra->num_formats; i++) { + const struct vk_format *fmt = ra->formats[i]->priv; + if (fmt->iformat == vk->surf_format.format) { + format = ra->formats[i]; + break; + } + } + + if (!format) { + MP_ERR(ra, "Could not find ra_format suitable for wrapped swchain image " + "with surface format %d\n", vk->surf_format.format); + goto error; + } + + tex = talloc_zero(NULL, struct ra_tex); + tex->params = (struct ra_tex_params) { + .format = format, + .dimensions = 2, + .w = info.imageExtent.width, + .h = info.imageExtent.height, + .d = 1, + .blit_src = !!(info.imageUsage & 
VK_IMAGE_USAGE_TRANSFER_SRC_BIT), + .blit_dst = !!(info.imageUsage & VK_IMAGE_USAGE_TRANSFER_DST_BIT), + .render_src = !!(info.imageUsage & VK_IMAGE_USAGE_SAMPLED_BIT), + .render_dst = !!(info.imageUsage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT), + }; + + struct ra_tex_vk *tex_vk = tex->priv = talloc_zero(tex, struct ra_tex_vk); + tex_vk->type = VK_IMAGE_TYPE_2D; + tex_vk->external_img = true; + tex_vk->img = vkimg; + + if (!vk_init_image(ra, tex)) + goto error; + + return tex; + +error: + vk_tex_destroy(ra, tex); + return NULL; +} + +// For ra_buf.priv +struct ra_buf_vk { + struct vk_bufslice slice; + bool inuse; + bool needsflush; + // "current" metadata, can change during course of execution + VkPipelineStageFlagBits current_stage; + VkAccessFlagBits current_access; +}; + +static void buf_free_to_use(void *priv, struct ra_buf_vk *buf_vk) +{ + buf_vk->inuse = false; +} + +static void buf_barrier(struct vk_cmd *cmd, struct ra_buf *buf, + VkPipelineStageFlagBits newStage, + VkAccessFlagBits newAccess, int offset, size_t size) +{ + struct ra_buf_vk *buf_vk = buf->priv; + + VkBufferMemoryBarrier buffBarrier = { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, + .srcAccessMask = buf_vk->current_access, + .dstAccessMask = newAccess, + .buffer = buf_vk->slice.buf, + .offset = offset, + .size = size, + }; + + if (buf_vk->needsflush || buf->params.host_mapped) { + buffBarrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT; + buf_vk->current_stage = VK_PIPELINE_STAGE_HOST_BIT; + buf_vk->needsflush = false; + } + + vkCmdPipelineBarrier(cmd->buf, buf_vk->current_stage, newStage, 0, + 0, NULL, 1, &buffBarrier, 0, NULL); + + buf_vk->current_stage = newStage; + buf_vk->current_access = newAccess; + buf_vk->inuse = true; + + vk_cmd_callback(cmd, (vk_cb) buf_free_to_use, NULL, buf_vk); +} + +static void vk_buf_destroy(struct ra *ra, struct ra_buf *buf) +{ + if (!buf) + return; + + struct mpvk_ctx *vk = vk_get(ra); + struct ra_buf_vk *buf_vk = buf->priv; + + if (buf_vk->slice.buf) + 
vk_free_memslice(vk, buf_vk->slice.mem); + + talloc_free(buf); +} + +MAKE_LAZY_DESTRUCTOR(vk_buf_destroy, struct ra_buf); + +static void vk_buf_update(struct ra *ra, struct ra_buf *buf, ptrdiff_t offset, + const void *data, size_t size) +{ + assert(buf->params.host_mutable || buf->params.initial_data); + struct ra_buf_vk *buf_vk = buf->priv; + + // For host-mapped buffers, we can just directly memcpy the buffer contents. + // Otherwise, we can update the buffer from the GPU using a command buffer. + if (buf_vk->slice.data) { + assert(offset + size <= buf->params.size); + uintptr_t addr = (uintptr_t)buf_vk->slice.data + offset; + memcpy((void *)addr, data, size); + buf_vk->needsflush = true; + } else { + struct vk_cmd *cmd = vk_require_cmd(ra); + if (!cmd) { + MP_ERR(ra, "Failed updating buffer!\n"); + return; + } + + VkDeviceSize bufOffset = buf_vk->slice.mem.offset + offset; + assert(bufOffset == MP_ALIGN_UP(bufOffset, 4)); + vkCmdUpdateBuffer(cmd->buf, buf_vk->slice.buf, bufOffset, size, data); + } +} + +static struct ra_buf *vk_buf_create(struct ra *ra, + const struct ra_buf_params *params) +{ + struct mpvk_ctx *vk = vk_get(ra); + + struct ra_buf *buf = talloc_zero(NULL, struct ra_buf); + buf->params = *params; + + struct ra_buf_vk *buf_vk = buf->priv = talloc_zero(buf, struct ra_buf_vk); + buf_vk->current_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + buf_vk->current_access = 0; + + VkBufferUsageFlagBits bufFlags = 0; + VkMemoryPropertyFlagBits memFlags = 0; + VkDeviceSize align = 4; // alignment 4 is needed for buf_update + + switch (params->type) { + case RA_BUF_TYPE_TEX_UPLOAD: + bufFlags |= VK_BUFFER_USAGE_TRANSFER_SRC_BIT; + memFlags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; + break; + case RA_BUF_TYPE_UNIFORM: + bufFlags |= VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT; + memFlags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + align = MP_ALIGN_UP(align, vk->limits.minUniformBufferOffsetAlignment); + break; + case RA_BUF_TYPE_SHADER_STORAGE: + bufFlags |= 
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; + memFlags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + align = MP_ALIGN_UP(align, vk->limits.minStorageBufferOffsetAlignment); + break; + case RA_BUF_TYPE_VERTEX: + bufFlags |= VK_BUFFER_USAGE_VERTEX_BUFFER_BIT; + memFlags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + break; + default: abort(); + } + + if (params->host_mutable || params->initial_data) { + bufFlags |= VK_BUFFER_USAGE_TRANSFER_DST_BIT; + align = MP_ALIGN_UP(align, vk->limits.optimalBufferCopyOffsetAlignment); + } + + if (params->host_mapped) { + memFlags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | + VK_MEMORY_PROPERTY_HOST_CACHED_BIT; + } + + if (!vk_malloc_buffer(vk, bufFlags, memFlags, params->size, align, + &buf_vk->slice)) + { + goto error; + } + + if (params->host_mapped) + buf->data = buf_vk->slice.data; + + if (params->initial_data) + vk_buf_update(ra, buf, 0, params->initial_data, params->size); + + buf->params.initial_data = NULL; // do this after vk_buf_update + return buf; + +error: + vk_buf_destroy(ra, buf); + return NULL; +} + +static bool vk_buf_poll(struct ra *ra, struct ra_buf *buf) +{ + struct ra_buf_vk *buf_vk = buf->priv; + return !buf_vk->inuse; +} + +static bool vk_tex_upload(struct ra *ra, + const struct ra_tex_upload_params *params) +{ + + struct ra_tex *tex = params->tex; + struct ra_tex_vk *tex_vk = tex->priv; + + if (!params->buf) + return ra_tex_upload_pbo(ra, &tex_vk->pbo, params); + + assert(!params->src); + assert(params->buf); + struct ra_buf *buf = params->buf; + struct ra_buf_vk *buf_vk = buf->priv; + + VkBufferImageCopy region = { + .bufferOffset = buf_vk->slice.mem.offset + params->buf_offset, + .bufferRowLength = tex->params.w, + .bufferImageHeight = tex->params.h, + .imageSubresource = vk_layers, + .imageExtent = (VkExtent3D){tex->params.w, tex->params.h, tex->params.d}, + }; + + if (tex->params.dimensions == 2) { + int pix_size = tex->params.format->pixel_size; + region.bufferRowLength = 
params->stride / pix_size; + if (region.bufferRowLength * pix_size != params->stride) { + MP_ERR(ra, "Texture upload strides must be a multiple of the texel " + "size!\n"); + goto error; + } + + if (params->rc) { + struct mp_rect *rc = params->rc; + region.imageOffset = (VkOffset3D){rc->x0, rc->y0, 0}; + region.imageExtent = (VkExtent3D){mp_rect_w(*rc), mp_rect_h(*rc), 1}; + } + } + + uint64_t size = region.bufferRowLength * region.bufferImageHeight * + region.imageExtent.depth; + + struct vk_cmd *cmd = vk_require_cmd(ra); + if (!cmd) + goto error; + + buf_barrier(cmd, buf, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_READ_BIT, region.bufferOffset, size); + + tex_barrier(cmd, tex_vk, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + params->invalidate); + + vkCmdCopyBufferToImage(cmd->buf, buf_vk->slice.buf, tex_vk->img, + tex_vk->current_layout, 1, ®ion); + + return true; + +error: + return false; +} + +#define MPVK_NUM_DS MPVK_MAX_STREAMING_DEPTH + +// For ra_renderpass.priv +struct ra_renderpass_vk { + // Compiled shaders + VkShaderModule vert; + VkShaderModule frag; + VkShaderModule comp; + // Pipeline / render pass + VkPipeline pipe; + VkPipelineLayout pipeLayout; + VkPipelineCache pipeCache; + VkRenderPass renderPass; + // Descriptor set (bindings) + VkDescriptorSetLayout dsLayout; + VkDescriptorPool dsPool; + VkDescriptorSet dss[MPVK_NUM_DS]; + int dindex; + // Vertex buffers (vertices) + struct ra_buf_pool vbo; + + // For updating + VkWriteDescriptorSet *dswrite; + VkDescriptorImageInfo *dsiinfo; + VkDescriptorBufferInfo *dsbinfo; +}; + +static void vk_renderpass_destroy(struct ra *ra, struct ra_renderpass *pass) +{ + if (!pass) + return; + + struct mpvk_ctx *vk = vk_get(ra); + struct ra_renderpass_vk *pass_vk = pass->priv; + + ra_buf_pool_uninit(ra, &pass_vk->vbo); + vkDestroyPipeline(vk->dev, pass_vk->pipe, MPVK_ALLOCATOR); + vkDestroyPipelineCache(vk->dev, pass_vk->pipeCache, 
MPVK_ALLOCATOR); + vkDestroyRenderPass(vk->dev, pass_vk->renderPass, MPVK_ALLOCATOR); + vkDestroyPipelineLayout(vk->dev, pass_vk->pipeLayout, MPVK_ALLOCATOR); + vkDestroyDescriptorPool(vk->dev, pass_vk->dsPool, MPVK_ALLOCATOR); + vkDestroyDescriptorSetLayout(vk->dev, pass_vk->dsLayout, MPVK_ALLOCATOR); + vkDestroyShaderModule(vk->dev, pass_vk->vert, MPVK_ALLOCATOR); + vkDestroyShaderModule(vk->dev, pass_vk->frag, MPVK_ALLOCATOR); + vkDestroyShaderModule(vk->dev, pass_vk->comp, MPVK_ALLOCATOR); + + talloc_free(pass); +} + +MAKE_LAZY_DESTRUCTOR(vk_renderpass_destroy, struct ra_renderpass); + +static const VkDescriptorType dsType[] = { + [RA_VARTYPE_TEX] = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + [RA_VARTYPE_IMG_W] = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + [RA_VARTYPE_BUF_RO] = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + [RA_VARTYPE_BUF_RW] = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, +}; + +static bool vk_get_input_format(struct ra *ra, struct ra_renderpass_input *inp, + VkFormat *out_fmt) +{ + struct mpvk_ctx *vk = vk_get(ra); + + enum ra_ctype ctype; + switch (inp->type) { + case RA_VARTYPE_FLOAT: ctype = RA_CTYPE_FLOAT; break; + case RA_VARTYPE_BYTE_UNORM: ctype = RA_CTYPE_UNORM; break; + default: abort(); + } + + assert(inp->dim_m == 1); + for (const struct vk_format *fmt = vk_formats; fmt->name; fmt++) { + if (fmt->ctype != ctype) + continue; + if (fmt->components != inp->dim_v) + continue; + if (fmt->bytes != ra_renderpass_input_layout(inp).size) + continue; + + // Ensure this format is valid for vertex attributes + VkFormatProperties prop; + vkGetPhysicalDeviceFormatProperties(vk->physd, fmt->iformat, &prop); + if (!(prop.bufferFeatures & VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT)) + continue; + + *out_fmt = fmt->iformat; + return true; + } + + return false; +} + +static const VkPipelineStageFlagBits stageFlags[] = { + [RA_RENDERPASS_TYPE_RASTER] = VK_SHADER_STAGE_FRAGMENT_BIT, + [RA_RENDERPASS_TYPE_COMPUTE] = VK_SHADER_STAGE_COMPUTE_BIT, +}; + +static struct ra_renderpass 
*vk_renderpass_create(struct ra *ra, + const struct ra_renderpass_params *params) +{ + struct mpvk_ctx *vk = vk_get(ra); + + struct ra_renderpass *pass = talloc_zero(NULL, struct ra_renderpass); + pass->params = *ra_renderpass_params_copy(pass, params); + pass->params.cached_program = (bstr){0}; + struct ra_renderpass_vk *pass_vk = pass->priv = + talloc_zero(pass, struct ra_renderpass_vk); + + static int dsCount[RA_VARTYPE_COUNT] = {0}; + VkDescriptorSetLayoutBinding *bindings = NULL; + int num_bindings = 0; + + for (int i = 0; i < params->num_inputs; i++) { + struct ra_renderpass_input *inp = ¶ms->inputs[i]; + switch (inp->type) { + case RA_VARTYPE_TEX: + case RA_VARTYPE_IMG_W: + case RA_VARTYPE_BUF_RO: + case RA_VARTYPE_BUF_RW: { + VkDescriptorSetLayoutBinding desc = { + .binding = inp->binding, + .descriptorType = dsType[inp->type], + .descriptorCount = 1, + .stageFlags = stageFlags[params->type], + }; + + MP_TARRAY_APPEND(pass, bindings, num_bindings, desc); + dsCount[inp->type]++; + break; + } + default: abort(); + } + } + + VkDescriptorPoolSize *dsPoolSizes = NULL; + int poolSizeCount = 0; + for (enum ra_vartype t = 0; t < RA_VARTYPE_COUNT; t++) { + if (dsCount[t] > 0) { + VkDescriptorPoolSize dssize = { + .type = dsType[t], + .descriptorCount = dsCount[t] * MPVK_NUM_DS, + }; + + MP_TARRAY_APPEND(pass, dsPoolSizes, poolSizeCount, dssize); + } + } + + VkDescriptorPoolCreateInfo pinfo = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, + .maxSets = MPVK_NUM_DS, + .pPoolSizes = dsPoolSizes, + .poolSizeCount = poolSizeCount, + }; + + VK(vkCreateDescriptorPool(vk->dev, &pinfo, MPVK_ALLOCATOR, &pass_vk->dsPool)); + talloc_free(dsPoolSizes); + + pass_vk->dswrite = talloc_array(pass, VkWriteDescriptorSet, num_bindings); + pass_vk->dsiinfo = talloc_array(pass, VkDescriptorImageInfo, num_bindings); + pass_vk->dsbinfo = talloc_array(pass, VkDescriptorBufferInfo, num_bindings); + + VkDescriptorSetLayoutCreateInfo dinfo = { + .sType = 
VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .pBindings = bindings, + .bindingCount = num_bindings, + }; + + VK(vkCreateDescriptorSetLayout(vk->dev, &dinfo, MPVK_ALLOCATOR, + &pass_vk->dsLayout)); + + VkDescriptorSetAllocateInfo ainfo = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, + .descriptorPool = pass_vk->dsPool, + .descriptorSetCount = 1, + .pSetLayouts = &pass_vk->dsLayout, + }; + + for (int i = 0; i < MPVK_NUM_DS; i++) + VK(vkAllocateDescriptorSets(vk->dev, &ainfo, &pass_vk->dss[i])); + + VkPipelineLayoutCreateInfo linfo = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .setLayoutCount = 1, + .pSetLayouts = &pass_vk->dsLayout, + }; + + VK(vkCreatePipelineLayout(vk->dev, &linfo, MPVK_ALLOCATOR, + &pass_vk->pipeLayout)); + + VkPipelineCacheCreateInfo pcinfo = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO, + .pInitialData = params->cached_program.start, + .initialDataSize = params->cached_program.len, + }; + + VK(vkCreatePipelineCache(vk->dev, &pcinfo, MPVK_ALLOCATOR, &pass_vk->pipeCache)); + + VkShaderModuleCreateInfo sinfo = { + .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, + }; + + switch (params->type) { + case RA_RENDERPASS_TYPE_RASTER: { + sinfo.pCode = (uint32_t *)params->vertex_shader; + sinfo.codeSize = strlen(params->vertex_shader); + VK(vkCreateShaderModule(vk->dev, &sinfo, MPVK_ALLOCATOR, &pass_vk->vert)); + + sinfo.pCode = (uint32_t *)params->frag_shader; + sinfo.codeSize = strlen(params->frag_shader); + VK(vkCreateShaderModule(vk->dev, &sinfo, MPVK_ALLOCATOR, &pass_vk->frag)); + + VK(vk_create_render_pass(vk->dev, params->target_format, + params->enable_blend, &pass_vk->renderPass)); + + VkPipelineShaderStageCreateInfo stages[] = { + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_VERTEX_BIT, + .module = pass_vk->vert, + .pName = "main", + }, + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = 
VK_SHADER_STAGE_FRAGMENT_BIT, + .module = pass_vk->frag, + .pName = "main", + } + }; + + VkVertexInputAttributeDescription *attrs = talloc_array(pass, + VkVertexInputAttributeDescription, params->num_vertex_attribs); + + for (int i = 0; i < params->num_vertex_attribs; i++) { + struct ra_renderpass_input *inp = ¶ms->vertex_attribs[i]; + attrs[i] = (VkVertexInputAttributeDescription) { + .location = i, + .binding = 0, + .offset = inp->offset, + }; + + if (!vk_get_input_format(ra, inp, &attrs[i].format)) { + MP_ERR(ra, "No suitable VkFormat for vertex attrib '%s'!\n", + inp->name); + goto error; + } + } + + static const VkBlendFactor blendFactors[] = { + [RA_BLEND_ZERO] = VK_BLEND_FACTOR_ZERO, + [RA_BLEND_ONE] = VK_BLEND_FACTOR_ONE, + [RA_BLEND_SRC_ALPHA] = VK_BLEND_FACTOR_SRC_ALPHA, + [RA_BLEND_ONE_MINUS_SRC_ALPHA] = VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA, + }; + + VkPipelineColorBlendAttachmentState binfo = { + .blendEnable = params->enable_blend, + .colorBlendOp = VK_BLEND_OP_ADD, + .srcColorBlendFactor = blendFactors[params->blend_src_rgb], + .dstColorBlendFactor = blendFactors[params->blend_dst_rgb], + .alphaBlendOp = VK_BLEND_OP_ADD, + .srcAlphaBlendFactor = blendFactors[params->blend_src_alpha], + .dstAlphaBlendFactor = blendFactors[params->blend_dst_alpha], + .colorWriteMask = VK_COLOR_COMPONENT_R_BIT | + VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT | + VK_COLOR_COMPONENT_A_BIT, + }; + + VkGraphicsPipelineCreateInfo cinfo = { + .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, + .stageCount = MP_ARRAY_SIZE(stages), + .pStages = &stages[0], + .pVertexInputState = &(VkPipelineVertexInputStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, + .vertexBindingDescriptionCount = 1, + .pVertexBindingDescriptions = &(VkVertexInputBindingDescription) { + .binding = 0, + .stride = params->vertex_stride, + .inputRate = VK_VERTEX_INPUT_RATE_VERTEX, + }, + .vertexAttributeDescriptionCount = params->num_vertex_attribs, + 
.pVertexAttributeDescriptions = attrs, + }, + .pInputAssemblyState = &(VkPipelineInputAssemblyStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, + .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST, + }, + .pViewportState = &(VkPipelineViewportStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO, + .viewportCount = 1, + .scissorCount = 1, + }, + .pRasterizationState = &(VkPipelineRasterizationStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO, + .polygonMode = VK_POLYGON_MODE_FILL, + .cullMode = VK_CULL_MODE_NONE, + .lineWidth = 1.0f, + }, + .pMultisampleState = &(VkPipelineMultisampleStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, + .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT, + }, + .pColorBlendState = &(VkPipelineColorBlendStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = &binfo, + }, + .pDynamicState = &(VkPipelineDynamicStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO, + .dynamicStateCount = 2, + .pDynamicStates = (VkDynamicState[]){ + VK_DYNAMIC_STATE_VIEWPORT, + VK_DYNAMIC_STATE_SCISSOR, + }, + }, + .layout = pass_vk->pipeLayout, + .renderPass = pass_vk->renderPass, + }; + + VK(vkCreateGraphicsPipelines(vk->dev, pass_vk->pipeCache, 1, &cinfo, + MPVK_ALLOCATOR, &pass_vk->pipe)); + break; + } + case RA_RENDERPASS_TYPE_COMPUTE: { + sinfo.pCode = (uint32_t *)params->compute_shader; + sinfo.codeSize = strlen(params->compute_shader); + VK(vkCreateShaderModule(vk->dev, &sinfo, MPVK_ALLOCATOR, &pass_vk->comp)); + + VkComputePipelineCreateInfo cinfo = { + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .stage = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .module = pass_vk->comp, + .pName = "main", + }, + .layout = pass_vk->pipeLayout, + 
}; + + VK(vkCreateComputePipelines(vk->dev, pass_vk->pipeCache, 1, &cinfo, + MPVK_ALLOCATOR, &pass_vk->pipe)); + break; + } + } + + // Update cached program + bstr *prog = &pass->params.cached_program; + VK(vkGetPipelineCacheData(vk->dev, pass_vk->pipeCache, &prog->len, NULL)); + prog->start = talloc_size(pass, prog->len); + VK(vkGetPipelineCacheData(vk->dev, pass_vk->pipeCache, &prog->len, prog->start)); + + return pass; + +error: + vk_renderpass_destroy(ra, pass); + return NULL; +} + +static void vk_update_descriptor(struct vk_cmd *cmd, + struct ra_renderpass *pass, + struct ra_renderpass_input_val val, + VkDescriptorSet ds, int idx) +{ + struct ra_renderpass_vk *pass_vk = pass->priv; + struct ra_renderpass_input *inp = &pass->params.inputs[val.index]; + + VkWriteDescriptorSet *wds = &pass_vk->dswrite[idx]; + *wds = (VkWriteDescriptorSet) { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = ds, + .dstBinding = inp->binding, + .descriptorCount = 1, + .descriptorType = dsType[inp->type], + }; + + switch (inp->type) { + case RA_VARTYPE_TEX: { + struct ra_tex *tex = *(struct ra_tex **)val.data; + struct ra_tex_vk *tex_vk = tex->priv; + + assert(tex->params.render_src); + tex_barrier(cmd, tex_vk, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, + VK_ACCESS_SHADER_READ_BIT, + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, false); + + VkDescriptorImageInfo *iinfo = &pass_vk->dsiinfo[idx]; + *iinfo = (VkDescriptorImageInfo) { + .sampler = tex_vk->sampler, + .imageView = tex_vk->view, + .imageLayout = tex_vk->current_layout, + }; + + wds->pImageInfo = iinfo; + break; + } + case RA_VARTYPE_IMG_W: { + struct ra_tex *tex = *(struct ra_tex **)val.data; + struct ra_tex_vk *tex_vk = tex->priv; + + assert(tex->params.storage_dst); + tex_barrier(cmd, tex_vk, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, + VK_ACCESS_SHADER_WRITE_BIT, + VK_IMAGE_LAYOUT_GENERAL, false); + + VkDescriptorImageInfo *iinfo = &pass_vk->dsiinfo[idx]; + *iinfo = (VkDescriptorImageInfo) { + .imageView = tex_vk->view, 
+ .imageLayout = tex_vk->current_layout, + }; + + wds->pImageInfo = iinfo; + break; + } + case RA_VARTYPE_BUF_RO: + case RA_VARTYPE_BUF_RW: { + struct ra_buf *buf = *(struct ra_buf **)val.data; + struct ra_buf_vk *buf_vk = buf->priv; + + VkBufferUsageFlags access = VK_ACCESS_SHADER_READ_BIT; + if (inp->type == RA_VARTYPE_BUF_RW) + access |= VK_ACCESS_SHADER_WRITE_BIT; + + buf_barrier(cmd, buf, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, + access, buf_vk->slice.mem.offset, buf->params.size); + + VkDescriptorBufferInfo *binfo = &pass_vk->dsbinfo[idx]; + *binfo = (VkDescriptorBufferInfo) { + .buffer = buf_vk->slice.buf, + .offset = buf_vk->slice.mem.offset, + .range = buf->params.size, + }; + + wds->pBufferInfo = binfo; + break; + } + } +} + +static void vk_renderpass_run(struct ra *ra, + const struct ra_renderpass_run_params *params) +{ + struct mpvk_ctx *vk = vk_get(ra); + struct ra_renderpass *pass = params->pass; + struct ra_renderpass_vk *pass_vk = pass->priv; + + struct vk_cmd *cmd = vk_require_cmd(ra); + if (!cmd) + goto error; + + static const VkPipelineBindPoint bindPoint[] = { + [RA_RENDERPASS_TYPE_RASTER] = VK_PIPELINE_BIND_POINT_GRAPHICS, + [RA_RENDERPASS_TYPE_COMPUTE] = VK_PIPELINE_BIND_POINT_COMPUTE, + }; + + vkCmdBindPipeline(cmd->buf, bindPoint[pass->params.type], pass_vk->pipe); + + VkDescriptorSet ds = pass_vk->dss[pass_vk->dindex++]; + pass_vk->dindex %= MPVK_NUM_DS; + + for (int i = 0; i < params->num_values; i++) + vk_update_descriptor(cmd, pass, params->values[i], ds, i); + + if (params->num_values > 0) { + vkUpdateDescriptorSets(vk->dev, params->num_values, pass_vk->dswrite, + 0, NULL); + } + + vkCmdBindDescriptorSets(cmd->buf, bindPoint[pass->params.type], + pass_vk->pipeLayout, 0, 1, &ds, 0, NULL); + + switch (pass->params.type) { + case RA_RENDERPASS_TYPE_COMPUTE: + vkCmdDispatch(cmd->buf, params->compute_groups[0], + params->compute_groups[1], + params->compute_groups[2]); + break; + case RA_RENDERPASS_TYPE_RASTER: { + struct ra_tex *tex = 
params->target; + struct ra_tex_vk *tex_vk = tex->priv; + assert(tex->params.render_dst); + + struct ra_buf_params buf_params = { + .type = RA_BUF_TYPE_VERTEX, + .size = params->vertex_count * pass->params.vertex_stride, + .host_mutable = true, + }; + + struct ra_buf *buf = ra_buf_pool_get(ra, &pass_vk->vbo, &buf_params); + if (!buf) { + MP_ERR(ra, "Failed allocating vertex buffer!\n"); + goto error; + } + struct ra_buf_vk *buf_vk = buf->priv; + + vk_buf_update(ra, buf, 0, params->vertex_data, buf_params.size); + + buf_barrier(cmd, buf, VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, + VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT, + buf_vk->slice.mem.offset, buf_params.size); + + vkCmdBindVertexBuffers(cmd->buf, 0, 1, &buf_vk->slice.buf, + &buf_vk->slice.mem.offset); + + tex_barrier(cmd, tex_vk, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, + VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, false); + + VkViewport viewport = { + .x = params->viewport.x0, + .y = params->viewport.y0, + .width = mp_rect_w(params->viewport), + .height = mp_rect_h(params->viewport), + }; + + VkRect2D scissor = { + .offset = {params->scissors.x0, params->scissors.y0}, + .extent = {mp_rect_w(params->scissors), mp_rect_h(params->scissors)}, + }; + + vkCmdSetViewport(cmd->buf, 0, 1, &viewport); + vkCmdSetScissor(cmd->buf, 0, 1, &scissor); + + VkRenderPassBeginInfo binfo = { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, + .renderPass = pass_vk->renderPass, + .framebuffer = tex_vk->framebuffer, + .renderArea = (VkRect2D){{0, 0}, {tex->params.w, tex->params.h}}, + }; + + vkCmdBeginRenderPass(cmd->buf, &binfo, VK_SUBPASS_CONTENTS_INLINE); + vkCmdDraw(cmd->buf, params->vertex_count, 1, 0, 0); + vkCmdEndRenderPass(cmd->buf); + break; + } + default: abort(); + }; + +error: + return; +} + +static void vk_blit(struct ra *ra, struct ra_tex *dst, struct ra_tex *src, + struct mp_rect *dst_rc, struct mp_rect *src_rc) +{ + assert(src->params.blit_src); + assert(dst->params.blit_dst); + 
+ struct ra_tex_vk *src_vk = src->priv; + struct ra_tex_vk *dst_vk = dst->priv; + + struct vk_cmd *cmd = vk_require_cmd(ra); + if (!cmd) + return; + + tex_barrier(cmd, src_vk, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_READ_BIT, + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + false); + + bool discard = dst_rc->x0 == 0 && + dst_rc->y0 == 0 && + dst_rc->x1 == dst->params.w && + dst_rc->y1 == dst->params.h; + + tex_barrier(cmd, dst_vk, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + discard); + + VkImageBlit region = { + .srcSubresource = vk_layers, + .srcOffsets = {{src_rc->x0, src_rc->y0, 0}, {src_rc->x1, src_rc->y1, 1}}, + .dstSubresource = vk_layers, + .dstOffsets = {{dst_rc->x0, dst_rc->y0, 0}, {dst_rc->x1, dst_rc->y1, 1}}, + }; + + vkCmdBlitImage(cmd->buf, src_vk->img, src_vk->current_layout, dst_vk->img, + dst_vk->current_layout, 1, ®ion, VK_FILTER_NEAREST); +} + +static void vk_clear(struct ra *ra, struct ra_tex *tex, float color[4], + struct mp_rect *rc) +{ + struct ra_vk *p = ra->priv; + struct ra_tex_vk *tex_vk = tex->priv; + assert(tex->params.blit_dst); + + struct vk_cmd *cmd = vk_require_cmd(ra); + if (!cmd) + return; + + struct mp_rect full = {0, 0, tex->params.w, tex->params.h}; + if (!rc || mp_rect_equals(rc, &full)) { + // To clear the entire image, we can use the efficient clear command + tex_barrier(cmd, tex_vk, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, true); + + VkClearColorValue clearColor = {0}; + for (int c = 0; c < 4; c++) + clearColor.float32[c] = color[c]; + + vkCmdClearColorImage(cmd->buf, tex_vk->img, tex_vk->current_layout, + &clearColor, 1, &vk_range); + } else { + // To simulate per-region clearing, we blit from a 1x1 texture instead + struct ra_tex_upload_params ul_params = { + .tex = p->clear_tex, + .invalidate = true, + .src = &color[0], + }; + vk_tex_upload(ra, &ul_params); + vk_blit(ra, tex, p->clear_tex, rc, 
&(struct mp_rect){0, 0, 1, 1}); + } +} + +#define VK_QUERY_POOL_SIZE (MPVK_MAX_STREAMING_DEPTH * 4) + +struct vk_timer { + VkQueryPool pool; + int index; + uint64_t result; +}; + +static void vk_timer_destroy(struct ra *ra, ra_timer *ratimer) +{ + if (!ratimer) + return; + + struct mpvk_ctx *vk = vk_get(ra); + struct vk_timer *timer = ratimer; + + vkDestroyQueryPool(vk->dev, timer->pool, MPVK_ALLOCATOR); + + talloc_free(timer); +} + +MAKE_LAZY_DESTRUCTOR(vk_timer_destroy, ra_timer); + +static ra_timer *vk_timer_create(struct ra *ra) +{ + struct mpvk_ctx *vk = vk_get(ra); + + struct vk_timer *timer = talloc_zero(NULL, struct vk_timer); + + struct VkQueryPoolCreateInfo qinfo = { + .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, + .queryType = VK_QUERY_TYPE_TIMESTAMP, + .queryCount = VK_QUERY_POOL_SIZE, + }; + + VK(vkCreateQueryPool(vk->dev, &qinfo, MPVK_ALLOCATOR, &timer->pool)); + + return (ra_timer *)timer; + +error: + vk_timer_destroy(ra, timer); + return NULL; +} + +static void vk_timer_start(struct ra *ra, ra_timer *ratimer) +{ + struct mpvk_ctx *vk = vk_get(ra); + struct vk_timer *timer = ratimer; + + struct vk_cmd *cmd = vk_require_cmd(ra); + if (!cmd) + return; + + timer->index = (timer->index + 2) % VK_QUERY_POOL_SIZE; + + uint64_t out[2]; + VkResult res = vkGetQueryPoolResults(vk->dev, timer->pool, timer->index, 2, + sizeof(out), &out[0], sizeof(uint64_t), + VK_QUERY_RESULT_64_BIT); + switch (res) { + case VK_SUCCESS: + timer->result = out[1] - out[0]; + break; + case VK_NOT_READY: + timer->result = 0; + break; + default: + MP_WARN(vk, "Failed reading timer query result: %s\n", vk_err(res)); + return; + }; + + vkCmdWriteTimestamp(cmd->buf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + timer->pool, timer->index); +} + +static uint64_t vk_timer_stop(struct ra *ra, ra_timer *ratimer) +{ + struct vk_timer *timer = ratimer; + struct vk_cmd *cmd = vk_require_cmd(ra); + + if (cmd) { + vkCmdWriteTimestamp(cmd->buf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + timer->pool, 
timer->index + 1); + } + + return timer->result; +} + +static struct ra_fns ra_fns_vk = { + .destroy = vk_destroy_ra, + .tex_create = vk_tex_create, + .tex_destroy = vk_tex_destroy_lazy, + .tex_upload = vk_tex_upload, + .buf_create = vk_buf_create, + .buf_destroy = vk_buf_destroy_lazy, + .buf_update = vk_buf_update, + .buf_poll = vk_buf_poll, + .clear = vk_clear, + .blit = vk_blit, + .renderpass_create = vk_renderpass_create, + .renderpass_destroy = vk_renderpass_destroy_lazy, + .renderpass_run = vk_renderpass_run, + .timer_create = vk_timer_create, + .timer_destroy = vk_timer_destroy_lazy, + .timer_start = vk_timer_start, + .timer_stop = vk_timer_stop, +}; + +static void present_cb(struct ra *ra, int *inflight) +{ + *inflight -= 1; +} + +bool ra_vk_present_frame(struct ra *ra, struct vk_swimg *swimg, int *inflight) +{ + struct ra_vk *p = ra->priv; + struct mpvk_ctx *vk = vk_get(ra); + assert(p->active_cmd); + + if (inflight) { + *inflight += 1; + vk_callback(ra, (vk_cb)present_cb, inflight); + } + + struct ra_tex *img = swimg->image; + + tex_barrier(p->active_cmd, img->priv, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + 0, VK_IMAGE_LAYOUT_PRESENT_SRC_KHR, false); + + // These are the only two stages that we use/support for actually + // outputting to swapchain imagechain images, so just add a dependency + // on both of them. In theory, we could maybe come up with some more + // advanced mechanism of tracking dynamic dependencies, but that seems + // like overkill. 
+ vk_cmd_dep(p->active_cmd, swimg->acquired, + VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT | + VK_PIPELINE_STAGE_TRANSFER_BIT); + + VkSemaphore done; + if (!vk_cmd_submit(vk, p->active_cmd, &done)) + goto error; + p->active_cmd = NULL; + + struct vk_cmdpool *pool = vk->pool; + VkQueue queue = pool->queues[pool->qindex]; + pool->qindex %= pool->qcount; + + VkPresentInfoKHR pinfo = { + .sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR, + .waitSemaphoreCount = 1, + .pWaitSemaphores = &done, + .swapchainCount = 1, + .pSwapchains = &swimg->chain->swchain, + .pImageIndices = &swimg->index, + }; + + VK(vkQueuePresentKHR(queue, &pinfo)); + + return true; + +error: + return false; +} diff --git a/video/out/vulkan/ra_vk.h b/video/out/vulkan/ra_vk.h new file mode 100644 index 0000000000000..214a9af6f3552 --- /dev/null +++ b/video/out/vulkan/ra_vk.h @@ -0,0 +1,25 @@ +#pragma once + +#include "common.h" +#include "utils.h" +#include "video/out/opengl/ra.h" + +struct ra *ra_create_vk(struct mpvk_ctx *vk, struct mp_log *log); + +// Access to the VkDevice is needed for swapchain creation +VkDevice ra_vk_get_dev(struct ra *ra); + +// Allocates a ra_tex that wraps a swapchain image. The contents of the image +// will be invalidated, and access to it will only be internally synchronized. +// So the calling code should not do anything else with the VkImage. +struct ra_tex *ra_vk_wrap_swchain_img(struct ra *ra, VkImage vkimg, + VkSwapchainCreateInfoKHR info); + +// This function flushes the command buffers, and enqueues the image for +// presentation. This command must only be used after drawing to the vk_swchain, +// but before the command buffers are flushed for other reasons (for +// synchronization). The frames_in_flight pointer will be used to track how +// many frames are currently in flight. 
(That is, it will be incremented when +// this function is called, and decremented when the command completes) +bool ra_vk_present_frame(struct ra *ra, struct vk_swimg *swimg, + int *frames_in_flight); diff --git a/video/out/vulkan/utils.c b/video/out/vulkan/utils.c new file mode 100644 index 0000000000000..6c14bce2455d4 --- /dev/null +++ b/video/out/vulkan/utils.c @@ -0,0 +1,936 @@ +#include + +#include "utils.h" +#include "malloc.h" +#include "ra_vk.h" +#include "video/out/x11_common.h" + +const char* vk_err(VkResult res) +{ + switch (res) { + // These are technically success codes, but include them nonetheless + case VK_SUCCESS: return "VK_SUCCESS"; + case VK_NOT_READY: return "VK_NOT_READY"; + case VK_TIMEOUT: return "VK_TIMEOUT"; + case VK_EVENT_SET: return "VK_EVENT_SET"; + case VK_EVENT_RESET: return "VK_EVENT_RESET"; + case VK_INCOMPLETE: return "VK_INCOMPLETE"; + + // Actual error codes + case VK_ERROR_OUT_OF_HOST_MEMORY: return "VK_ERROR_OUT_OF_HOST_MEMORY"; + case VK_ERROR_OUT_OF_DEVICE_MEMORY: return "VK_ERROR_OUT_OF_DEVICE_MEMORY"; + case VK_ERROR_INITIALIZATION_FAILED: return "VK_ERROR_INITIALIZATION_FAILED"; + case VK_ERROR_DEVICE_LOST: return "VK_ERROR_DEVICE_LOST"; + case VK_ERROR_MEMORY_MAP_FAILED: return "VK_ERROR_MEMORY_MAP_FAILED"; + case VK_ERROR_LAYER_NOT_PRESENT: return "VK_ERROR_LAYER_NOT_PRESENT"; + case VK_ERROR_EXTENSION_NOT_PRESENT: return "VK_ERROR_EXTENSION_NOT_PRESENT"; + case VK_ERROR_FEATURE_NOT_PRESENT: return "VK_ERROR_FEATURE_NOT_PRESENT"; + case VK_ERROR_INCOMPATIBLE_DRIVER: return "VK_ERROR_INCOMPATIBLE_DRIVER"; + case VK_ERROR_TOO_MANY_OBJECTS: return "VK_ERROR_TOO_MANY_OBJECTS"; + case VK_ERROR_FORMAT_NOT_SUPPORTED: return "VK_ERROR_FORMAT_NOT_SUPPORTED"; + case VK_ERROR_FRAGMENTED_POOL: return "VK_ERROR_FRAGMENTED_POOL"; + } + + return "Unknown error!"; +} + +static const char* vk_dbg_type(VkDebugReportObjectTypeEXT type) +{ + switch (type) { + case VK_DEBUG_REPORT_OBJECT_TYPE_INSTANCE_EXT: + return "VkInstance"; + case 
VK_DEBUG_REPORT_OBJECT_TYPE_PHYSICAL_DEVICE_EXT: + return "VkPhysicalDevice"; + case VK_DEBUG_REPORT_OBJECT_TYPE_DEVICE_EXT: + return "VkDevice"; + case VK_DEBUG_REPORT_OBJECT_TYPE_QUEUE_EXT: + return "VkQueue"; + case VK_DEBUG_REPORT_OBJECT_TYPE_SEMAPHORE_EXT: + return "VkSemaphore"; + case VK_DEBUG_REPORT_OBJECT_TYPE_COMMAND_BUFFER_EXT: + return "VkCommandBuffer"; + case VK_DEBUG_REPORT_OBJECT_TYPE_FENCE_EXT: + return "VkFence"; + case VK_DEBUG_REPORT_OBJECT_TYPE_DEVICE_MEMORY_EXT: + return "VkDeviceMemory"; + case VK_DEBUG_REPORT_OBJECT_TYPE_BUFFER_EXT: + return "VkBuffer"; + case VK_DEBUG_REPORT_OBJECT_TYPE_IMAGE_EXT: + return "VkImage"; + case VK_DEBUG_REPORT_OBJECT_TYPE_EVENT_EXT: + return "VkEvent"; + case VK_DEBUG_REPORT_OBJECT_TYPE_QUERY_POOL_EXT: + return "VkQueryPool"; + case VK_DEBUG_REPORT_OBJECT_TYPE_BUFFER_VIEW_EXT: + return "VkBufferView"; + case VK_DEBUG_REPORT_OBJECT_TYPE_IMAGE_VIEW_EXT: + return "VkImageView"; + case VK_DEBUG_REPORT_OBJECT_TYPE_SHADER_MODULE_EXT: + return "VkShaderModule"; + case VK_DEBUG_REPORT_OBJECT_TYPE_PIPELINE_CACHE_EXT: + return "VkPipelineCache"; + case VK_DEBUG_REPORT_OBJECT_TYPE_PIPELINE_LAYOUT_EXT: + return "VkPipelineLayout"; + case VK_DEBUG_REPORT_OBJECT_TYPE_RENDER_PASS_EXT: + return "VkRenderPass"; + case VK_DEBUG_REPORT_OBJECT_TYPE_PIPELINE_EXT: + return "VkPipeline"; + case VK_DEBUG_REPORT_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT_EXT: + return "VkDescriptorSetLayout"; + case VK_DEBUG_REPORT_OBJECT_TYPE_SAMPLER_EXT: + return "VkSampler"; + case VK_DEBUG_REPORT_OBJECT_TYPE_DESCRIPTOR_POOL_EXT: + return "VkDescriptorPool"; + case VK_DEBUG_REPORT_OBJECT_TYPE_DESCRIPTOR_SET_EXT: + return "VkDescriptorSet"; + case VK_DEBUG_REPORT_OBJECT_TYPE_FRAMEBUFFER_EXT: + return "VkFramebuffer"; + case VK_DEBUG_REPORT_OBJECT_TYPE_COMMAND_POOL_EXT: + return "VkCommandPool"; + case VK_DEBUG_REPORT_OBJECT_TYPE_SURFACE_KHR_EXT: + return "VkSurfaceKHR"; + case VK_DEBUG_REPORT_OBJECT_TYPE_SWAPCHAIN_KHR_EXT: + return "VkSwapchainKHR"; + case 
VK_DEBUG_REPORT_OBJECT_TYPE_DEBUG_REPORT_EXT: + return "VkDebugReportCallbackEXT"; + case VK_DEBUG_REPORT_OBJECT_TYPE_UNKNOWN_EXT: + default: + return "unknown object"; + } +} + +static VkBool32 vk_dbg_callback(VkDebugReportFlagsEXT flags, + VkDebugReportObjectTypeEXT objType, + uint64_t obj, size_t loc, int32_t msgCode, + const char *layer, const char *msg, void *priv) +{ + struct mpvk_ctx *vk = priv; + int lev = MSGL_V; + + switch (flags) { + case VK_DEBUG_REPORT_ERROR_BIT_EXT: lev = MSGL_ERR; break; + case VK_DEBUG_REPORT_WARNING_BIT_EXT: lev = MSGL_WARN; break; + case VK_DEBUG_REPORT_INFORMATION_BIT_EXT: lev = MSGL_TRACE; break; + case VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT: lev = MSGL_WARN; break; + case VK_DEBUG_REPORT_DEBUG_BIT_EXT: lev = MSGL_DEBUG; break; + }; + + MP_MSG(vk, lev, "vk [%s] %d: %s (obj 0x%lx (%s), loc 0x%lx)\n", + layer, msgCode, msg, obj, vk_dbg_type(objType), loc); + + // The return value of this function determines whether the call will + // be explicitly aborted (to prevent GPU errors) or not. In this case, + // we generally want this to be on for the errors. + return (flags & VK_DEBUG_REPORT_ERROR_BIT_EXT); +} + +void mpvk_uninit(struct mpvk_ctx *vk) +{ + if (!vk->inst) + return; + + if (vk->dev) { + struct vk_cmdpool *pool = vk->pool; + // also frees associated command buffers + vkDestroyCommandPool(vk->dev, pool->pool, MPVK_ALLOCATOR); + for (int n = 0; n < MPVK_MAX_CMDS; n++) { + vkDestroyFence(vk->dev, pool->cmds[n].fence, MPVK_ALLOCATOR); + vkDestroySemaphore(vk->dev, pool->cmds[n].done, MPVK_ALLOCATOR); + talloc_free(pool->cmds[n].callbacks); + } + talloc_free(vk->pool); + vk_malloc_uninit(vk); + vkDestroyDevice(vk->dev, MPVK_ALLOCATOR); + } + + if (vk->dbg) { + // Same deal as creating the debug callback, we need to load this + // first. 
+ VK_LOAD_PFN(vkDestroyDebugReportCallbackEXT) + pfn_vkDestroyDebugReportCallbackEXT(vk->inst, vk->dbg, MPVK_ALLOCATOR); + } + + vkDestroySurfaceKHR(vk->inst, vk->surf, MPVK_ALLOCATOR); + vkDestroyInstance(vk->inst, MPVK_ALLOCATOR); + + *vk = (struct mpvk_ctx){0}; +} + +bool mpvk_instance_init(struct mpvk_ctx *vk, bool debug) +{ + VkInstanceCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, + }; + + if (debug) { + // Enables the LunarG standard validation layer, which + // is a meta-layer that loads lots of other validators + static const char* layers[] = { + "VK_LAYER_LUNARG_standard_validation", + }; + + info.ppEnabledLayerNames = layers; + info.enabledLayerCount = MP_ARRAY_SIZE(layers); + } + + // Enable whatever extensions were compiled in. + static const char *extensions[] = { + VK_KHR_SURFACE_EXTENSION_NAME, +#if HAVE_VULKAN_XLIB + VK_KHR_XLIB_SURFACE_EXTENSION_NAME, +#endif + + // Extra extensions only used for debugging. These are toggled by + // decreasing the enabledExtensionCount, so the number needs to be + // synchronized with the code below. 
+ VK_EXT_DEBUG_REPORT_EXTENSION_NAME, + }; + + const int debugExtensionCount = 1; + + info.ppEnabledExtensionNames = extensions; + info.enabledExtensionCount = MP_ARRAY_SIZE(extensions); + + if (!debug) + info.enabledExtensionCount -= debugExtensionCount; + + VkResult res = vkCreateInstance(&info, MPVK_ALLOCATOR, &vk->inst); + if (res != VK_SUCCESS) { + MP_VERBOSE(vk, "failed creating instance: %s\n", vk_err(res)); + return false; + } + + if (debug) { + // Set up a debug callback to catch validation messages + VkDebugReportCallbackCreateInfoEXT dinfo = { + .sType = VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT, + .flags = VK_DEBUG_REPORT_INFORMATION_BIT_EXT | + VK_DEBUG_REPORT_WARNING_BIT_EXT | + VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT | + VK_DEBUG_REPORT_ERROR_BIT_EXT | + VK_DEBUG_REPORT_DEBUG_BIT_EXT, + .pfnCallback = vk_dbg_callback, + .pUserData = vk, + }; + + // Since this is not part of the core spec, we need to load it. This + // can't fail because we've already successfully created an instance + // with this extension enabled. 
+ VK_LOAD_PFN(vkCreateDebugReportCallbackEXT) + pfn_vkCreateDebugReportCallbackEXT(vk->inst, &dinfo, MPVK_ALLOCATOR, + &vk->dbg); + } + + return true; +} + +#define MPVK_MAX_DEVICES 16 + +static bool physd_supports_surface(struct mpvk_ctx *vk, VkPhysicalDevice physd) +{ + uint32_t qfnum; + vkGetPhysicalDeviceQueueFamilyProperties(physd, &qfnum, NULL); + + for (int i = 0; i < qfnum; i++) { + VkBool32 sup; + VK(vkGetPhysicalDeviceSurfaceSupportKHR(physd, i, vk->surf, &sup)); + if (sup) + return true; + } + +error: + return false; +} + +bool mpvk_find_phys_device(struct mpvk_ctx *vk, const char *name, bool sw) +{ + assert(vk->surf); + + MP_VERBOSE(vk, "Probing for vulkan devices..\n"); + + VkPhysicalDevice *devices = NULL; + uint32_t num = 0; + VK(vkEnumeratePhysicalDevices(vk->inst, &num, NULL)); + devices = talloc_array(NULL, VkPhysicalDevice, num); + VK(vkEnumeratePhysicalDevices(vk->inst, &num, devices)); + + // Sorted by "priority". Reuses some m_opt code for convenience + static const struct m_opt_choice_alternatives types[] = { + {"discrete", VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU}, + {"integrated", VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU}, + {"virtual", VK_PHYSICAL_DEVICE_TYPE_VIRTUAL_GPU}, + {"software", VK_PHYSICAL_DEVICE_TYPE_CPU}, + {"unknown", VK_PHYSICAL_DEVICE_TYPE_OTHER}, + {0} + }; + + VkPhysicalDeviceProperties props[MPVK_MAX_DEVICES]; + for (int i = 0; i < num; i++) { + vkGetPhysicalDeviceProperties(devices[i], &props[i]); + MP_VERBOSE(vk, "GPU %d: %s (%s)\n", i, props[i].deviceName, + m_opt_choice_str(types, props[i].deviceType)); + } + + // Iterate through each type in order of decreasing preference + for (int t = 0; types[t].name; t++) { + // Disallow SW rendering unless explicitly enabled + if (types[t].value == VK_PHYSICAL_DEVICE_TYPE_CPU && !sw) + continue; + + for (int i = 0; i < num; i++) { + VkPhysicalDeviceProperties prop = props[i]; + if (prop.deviceType != types[t].value) + continue; + if (name && strcmp(name, prop.deviceName) != 0) + 
continue; + if (!physd_supports_surface(vk, devices[i])) + continue; + + MP_VERBOSE(vk, "Found device:\n"); + MP_VERBOSE(vk, " Device Name: %s\n", prop.deviceName); + MP_VERBOSE(vk, " Device ID: %x:%x\n", prop.vendorID, prop.deviceID); + MP_VERBOSE(vk, " Driver version: %d\n", prop.driverVersion); + MP_VERBOSE(vk, " API version: %d.%d.%d\n", + VK_VERSION_MAJOR(prop.apiVersion), + VK_VERSION_MINOR(prop.apiVersion), + VK_VERSION_PATCH(prop.apiVersion)); + vk->physd = devices[i]; + vk->limits = prop.limits; + talloc_free(devices); + return true; + } + } + +error: + MP_VERBOSE(vk, "Found no suitable device, giving up.\n"); + talloc_free(devices); + return false; +} + +bool mpvk_pick_surface_format(struct mpvk_ctx *vk) +{ + assert(vk->physd); + + VkSurfaceFormatKHR *formats = NULL; + int num; + + // Enumerate through the surface formats and find one that we can map to + // a ra_format + VK(vkGetPhysicalDeviceSurfaceFormatsKHR(vk->physd, vk->surf, &num, NULL)); + formats = talloc_array(NULL, VkSurfaceFormatKHR, num); + VK(vkGetPhysicalDeviceSurfaceFormatsKHR(vk->physd, vk->surf, &num, formats)); + + for (int i = 0; i < num; i++) { + // A value of VK_FORMAT_UNDEFINED means we can pick anything we want + if (formats[i].format == VK_FORMAT_UNDEFINED) { + vk->surf_format = (VkSurfaceFormatKHR) { + .colorSpace = VK_COLOR_SPACE_SRGB_NONLINEAR_KHR, + .format = VK_FORMAT_R8G8B8A8_UNORM, + }; + break; + } + + if (formats[i].colorSpace != VK_COLOR_SPACE_SRGB_NONLINEAR_KHR) + continue; + + vk->surf_format = formats[i]; + break; + } + + talloc_free(formats); + + if (!vk->surf_format.format) + goto error; + + return true; + +error: + MP_ERR(vk, "Failed picking surface format!\n"); + talloc_free(formats); + return false; +} + +bool mpvk_surface_init(struct vo *vo, struct mpvk_ctx *vk) +{ + assert(vk->inst); + VkResult res; + +#if HAVE_VULKAN_XLIB + if (!vo_x11_init(vo)) + goto xlib_uninit; + + if (!vo_x11_create_vo_window(vo, NULL, "mpvk")) + goto xlib_uninit; + + 
VkXlibSurfaceCreateInfoKHR xinfo = { + .sType = VK_STRUCTURE_TYPE_XLIB_SURFACE_CREATE_INFO_KHR, + .dpy = vo->x11->display, + .window = vo->x11->window, + }; + + res = vkCreateXlibSurfaceKHR(vk->inst, &xinfo, MPVK_ALLOCATOR, &vk->surf); + if (res != VK_SUCCESS) { + MP_VERBOSE(vo, "Failed creating Xlib surface: %s\n", vk_err(res)); + goto xlib_uninit; + } + + MP_VERBOSE(vo, "Using Xlib surface.\n"); + return true; + +xlib_uninit: + vo_x11_uninit(vo); +#endif + + // If we're reached this point, then none of the above surface probes + // were successful + MP_ERR(vo, "Failed creating any useful vulkan surface!\n"); + return false; +} + +bool mpvk_device_init(struct mpvk_ctx *vk) +{ + assert(vk->physd); + + VkQueueFamilyProperties *qfs = NULL; + int qfnum; + + // Enumerate the queue families and find suitable families for each task + vkGetPhysicalDeviceQueueFamilyProperties(vk->physd, &qfnum, NULL); + qfs = talloc_array(NULL, VkQueueFamilyProperties, qfnum); + vkGetPhysicalDeviceQueueFamilyProperties(vk->physd, &qfnum, qfs); + + MP_VERBOSE(vk, "Queue families supported by device:\n"); + + for (int i = 0; i < qfnum; i++) { + MP_VERBOSE(vk, "QF %d: flags 0x%x num %d\n", i, qfs[i].queueFlags, + qfs[i].queueCount); + } + + // Since using multiple queue families is devilishly difficult, we just + // pick a single queue family and stick with it. So in the interest of this, + // it's best to pick the one that supports the most features. + + int idx = -1; + for (int i = 0; i < qfnum; i++) { + if (!(qfs[i].queueFlags & VK_QUEUE_GRAPHICS_BIT)) + continue; + + // QF supports more features + if (idx < 0 || qfs[i].queueFlags > qfs[idx].queueFlags) + idx = i; + + // QF supports more queues (at the same specialization level) + if (qfs[i].queueFlags == qfs[idx].queueFlags && + qfs[i].queueCount > qfs[idx].queueCount) + { + idx = i; + } + } + + // Vulkan requires at least one GRAPHICS queue, so if this fails something + // is horribly wrong. 
+ assert(idx >= 0); + + // Now that we know which queue family we want, we can create the logical + // device + static const float priorities[MPVK_MAX_QUEUES] = {0}; + VkDeviceQueueCreateInfo qinfo = { + .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO, + .queueFamilyIndex = idx, + .queueCount = MPMIN(qfs[idx].queueCount, MPVK_MAX_QUEUES), + .pQueuePriorities = priorities, + }; + + static const char *exts[] = { + VK_KHR_SWAPCHAIN_EXTENSION_NAME, + VK_NV_GLSL_SHADER_EXTENSION_NAME, + }; + + VkDeviceCreateInfo dinfo = { + .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, + .queueCreateInfoCount = 1, + .pQueueCreateInfos = &qinfo, + .ppEnabledExtensionNames = exts, + .enabledExtensionCount = MP_ARRAY_SIZE(exts), + }; + + MP_VERBOSE(vk, "Creating vulkan device...\n"); + VK(vkCreateDevice(vk->physd, &dinfo, MPVK_ALLOCATOR, &vk->dev)); + + vk_malloc_init(vk); + + // Create the vk_cmdpool and all required queues / synchronization objects + struct vk_cmdpool *pool = vk->pool = talloc_zero(NULL, struct vk_cmdpool); + *pool = (struct vk_cmdpool) { + .qf = qinfo.queueFamilyIndex, + .props = qfs[qinfo.queueFamilyIndex], + .qcount = qinfo.queueCount, + }; + + talloc_free(qfs); + + for (int n = 0; n < pool->qcount; n++) + vkGetDeviceQueue(vk->dev, pool->qf, n, &pool->queues[n]); + + VkCommandPoolCreateInfo cinfo = { + .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, + .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | + VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, + .queueFamilyIndex = pool->qf, + }; + + VK(vkCreateCommandPool(vk->dev, &cinfo, MPVK_ALLOCATOR, &pool->pool)); + + VkCommandBufferAllocateInfo ainfo = { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, + .commandPool = pool->pool, + .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY, + .commandBufferCount = MPVK_MAX_CMDS, + }; + + VkCommandBuffer cmdbufs[MPVK_MAX_CMDS]; + VK(vkAllocateCommandBuffers(vk->dev, &ainfo, cmdbufs)); + + for (int n = 0; n < MPVK_MAX_CMDS; n++) { + struct vk_cmd *cmd = &pool->cmds[n]; + 
cmd->pool = pool; + cmd->buf = cmdbufs[n]; + + VkFenceCreateInfo finfo = { + .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO, + .flags = VK_FENCE_CREATE_SIGNALED_BIT, + }; + + VK(vkCreateFence(vk->dev, &finfo, MPVK_ALLOCATOR, &cmd->fence)); + + VkSemaphoreCreateInfo sinfo = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, + }; + + VK(vkCreateSemaphore(vk->dev, &sinfo, MPVK_ALLOCATOR, &cmd->done)); + } + + // Ensure we can actually present to the surface using this queue + VkBool32 sup; + VK(vkGetPhysicalDeviceSurfaceSupportKHR(vk->physd, pool->qf, vk->surf, &sup)); + if (!sup) { + MP_ERR(vk, "Queue family does not support surface presentation!\n"); + goto error; + } + + return true; + +error: + MP_ERR(vk, "Failed creating logical device!\n"); + talloc_free(qfs); + return false; +} + +static void run_callbacks(struct vk_cmd *cmd) +{ + for (int i = 0; i < cmd->num_callbacks; i++) { + struct vk_callback *cb = &cmd->callbacks[i]; + cb->run(cb->priv, cb->arg); + *cb = (struct vk_callback){0}; + } + + cmd->num_callbacks = 0; +} + +static void wait_for_cmds(struct mpvk_ctx *vk, struct vk_cmd cmds[], int num) +{ + if (!num) + return; + + VkFence fences[MPVK_MAX_CMDS]; + for (int i = 0; i < num; i++) + fences[i] = cmds[i].fence; + + vkWaitForFences(vk->dev, num, fences, true, UINT64_MAX); + + for (int i = 0; i < num; i++) + run_callbacks(&cmds[i]); +} + +void mpvk_wait_idle(struct mpvk_ctx *vk) +{ + struct vk_cmdpool *pool = vk->pool; + + int idx = pool->cindex, pidx = pool->cindex_pending; + if (pidx < idx) { // range doesn't wrap + wait_for_cmds(vk, &pool->cmds[pidx], idx - pidx); + } else if (pidx > idx) { // range wraps + wait_for_cmds(vk, &pool->cmds[pidx], MPVK_MAX_CMDS - pidx); + wait_for_cmds(vk, &pool->cmds[0], idx); + } + pool->cindex_pending = pool->cindex; +} + +void mpvk_poll_cmds(struct mpvk_ctx *vk, struct vk_cmdpool *pool, + uint64_t timeout) +{ + // If requested, hard block until at least one command completes + if (timeout > 0 && pool->cindex_pending != 
pool->cindex) { + vkWaitForFences(vk->dev, 1, &pool->cmds[pool->cindex_pending].fence, + true, timeout); + } + + // Lazily garbage collect the commands based on their status + while (pool->cindex_pending != pool->cindex) { + struct vk_cmd *cmd = &pool->cmds[pool->cindex_pending]; + VkResult res = vkGetFenceStatus(vk->dev, cmd->fence); + if (res != VK_SUCCESS) + break; + run_callbacks(cmd); + pool->cindex_pending++; + pool->cindex_pending %= MPVK_MAX_CMDS; + } +} + +void vk_dev_callback(struct mpvk_ctx *vk, vk_cb callback, void *p, void *arg) +{ + struct vk_cmdpool *pool = vk->pool; + if (pool->cindex_pending == pool->cindex) { + // The device was already idle, so we can just immediately call it + callback(p, arg); + return; + } + + int prev_idx = pool->cindex - 1; + if (prev_idx < 0) + prev_idx += MPVK_MAX_CMDS; + + struct vk_cmd *last_cmd = &pool->cmds[prev_idx]; + vk_cmd_callback(last_cmd, callback, p, arg); +} + +const VkImageSubresourceRange vk_range = { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .levelCount = 1, + .layerCount = 1, +}; + +const VkImageSubresourceLayers vk_layers = { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .layerCount = 1, +}; + +void vk_cmd_callback(struct vk_cmd *cmd, vk_cb callback, void *p, void *arg) +{ + MP_TARRAY_GROW(NULL, cmd->callbacks, cmd->num_callbacks); + cmd->callbacks[cmd->num_callbacks++] = (struct vk_callback) { + .run = callback, + .priv = p, + .arg = arg, + }; +} + +void vk_cmd_dep(struct vk_cmd *cmd, VkSemaphore dep, + VkPipelineStageFlagBits depstage) +{ + assert(cmd->num_deps < MPVK_MAX_CMD_DEPS); + cmd->deps[cmd->num_deps] = dep; + cmd->depstages[cmd->num_deps++] = depstage; +} + +struct vk_cmd *vk_cmd_begin(struct mpvk_ctx *vk, struct vk_cmdpool *pool) +{ + // Garbage collect the cmdpool first + mpvk_poll_cmds(vk, pool, 0); + + int next = (pool->cindex + 1) % MPVK_MAX_CMDS; + if (next == pool->cindex_pending) { + MP_ERR(vk, "No free command buffers!\n"); + goto error; + } + + struct vk_cmd *cmd = 
&pool->cmds[pool->cindex]; + pool->cindex = next; + + VK(vkResetCommandBuffer(cmd->buf, 0)); + + VkCommandBufferBeginInfo binfo = { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, + .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT, + }; + + VK(vkBeginCommandBuffer(cmd->buf, &binfo)); + + return cmd; + +error: + return NULL; +} + +bool vk_cmd_submit(struct mpvk_ctx *vk, struct vk_cmd *cmd, VkSemaphore *done) +{ + VK(vkEndCommandBuffer(cmd->buf)); + + struct vk_cmdpool *pool = cmd->pool; + VkQueue queue = pool->queues[pool->qindex++]; + pool->qindex %= pool->qcount; + + VkSubmitInfo sinfo = { + .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO, + .commandBufferCount = 1, + .pCommandBuffers = &cmd->buf, + .waitSemaphoreCount = cmd->num_deps, + .pWaitSemaphores = cmd->deps, + .pWaitDstStageMask = cmd->depstages, + }; + + if (done) { + sinfo.signalSemaphoreCount = 1; + sinfo.pSignalSemaphores = &cmd->done; + *done = cmd->done; + } + + VK(vkResetFences(vk->dev, 1, &cmd->fence)); + VK(vkQueueSubmit(queue, 1, &sinfo, cmd->fence)); + MP_TRACE(vk, "Submitted command on queue %p\n", (void *)queue); + + for (int i = 0; i < cmd->num_deps; i++) + cmd->deps[i] = NULL; + cmd->num_deps = 0; + + return true; + +error: + return false; +} + +static bool vk_swchain_update_info(struct vk_swchain *chain, + VkSwapchainCreateInfoKHR *info) +{ + struct mpvk_ctx *vk = chain->vk; + + // Query the supported capabilities and update this struct as needed + VkSurfaceCapabilitiesKHR caps; + VK(vkGetPhysicalDeviceSurfaceCapabilitiesKHR(vk->physd, vk->surf, &caps)); + + // Sorted by preference + static const VkCompositeAlphaFlagBitsKHR alphaModes[] = { + VK_COMPOSITE_ALPHA_POST_MULTIPLIED_BIT_KHR, + VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR, + }; + + for (int i = 0; i < MP_ARRAY_SIZE(alphaModes); i++) { + if (caps.supportedCompositeAlpha & alphaModes[i]) { + info->compositeAlpha = alphaModes[i]; + break; + } + } + + if (!info->compositeAlpha) { + MP_ERR(vk, "Failed picking alpha compositing mode (caps: 
%d)\n", + caps.supportedCompositeAlpha); + goto error; + } + + static const VkSurfaceTransformFlagBitsKHR rotModes[] = { + VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR, + VK_SURFACE_TRANSFORM_INHERIT_BIT_KHR, + }; + + for (int i = 0; i < MP_ARRAY_SIZE(rotModes); i++) { + if (caps.supportedTransforms & rotModes[i]) { + info->preTransform = rotModes[i]; + break; + } + } + + if (!info->preTransform) { + MP_ERR(vk, "Failed picking surface transform mode (caps: %d)\n", + caps.supportedTransforms); + goto error; + } + + // Image count as required + info->minImageCount = MPMAX(info->minImageCount, caps.minImageCount); + if (caps.maxImageCount) + info->minImageCount = MPMIN(info->minImageCount, caps.maxImageCount); + + // Check the extend against the allowed parameters + if (caps.currentExtent.width != info->imageExtent.width && + caps.currentExtent.width != 0xFFFFFFFF) + { + MP_WARN(vk, "Requested width %d does not match current width %d\n", + info->imageExtent.width, caps.currentExtent.width); + info->imageExtent.width = caps.currentExtent.width; + } + + if (caps.currentExtent.height != info->imageExtent.height && + caps.currentExtent.height != 0xFFFFFFFF) + { + MP_WARN(vk, "Requested height %d does not match current height %d\n", + info->imageExtent.height, caps.currentExtent.height); + info->imageExtent.height = caps.currentExtent.height; + } + + if (caps.minImageExtent.width > info->imageExtent.width || + caps.minImageExtent.height > info->imageExtent.height) + { + MP_ERR(vk, "Requested size %dx%d smaller than device minimum %d%d\n", + info->imageExtent.width, info->imageExtent.height, + caps.minImageExtent.width, caps.minImageExtent.height); + goto error; + } + + if (caps.maxImageExtent.width < info->imageExtent.width || + caps.maxImageExtent.height < info->imageExtent.height) + { + MP_ERR(vk, "Requested size %dx%d larger than device maximum %d%d\n", + info->imageExtent.width, info->imageExtent.height, + caps.maxImageExtent.width, caps.maxImageExtent.height); + goto error; 
+ } + + // We just request whatever usage we can, and let the ra_vk decide what + // ra_tex_params that translates to. This makes the images as flexible + // as possible. + info->imageUsage = caps.supportedUsageFlags; + return true; + +error: + return false; +} + +bool vk_swchain_init(struct mpvk_ctx *vk, struct ra *ra, int size, + struct vk_swchain *chain) +{ + assert(vk->dev); + assert(vk->surf_format.format); + + struct VkSwapchainCreateInfoKHR dummy = { + .sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR, + .surface = vk->surf, + .minImageCount = size, + .imageFormat = vk->surf_format.format, + .imageColorSpace = vk->surf_format.colorSpace, + .imageArrayLayers = 1, // non-stereoscopic + .imageSharingMode = VK_SHARING_MODE_EXCLUSIVE, + .presentMode = VK_PRESENT_MODE_FIFO_KHR, + .clipped = true, + }; + + *chain = (struct vk_swchain) { + .vk = vk, + .ra = ra, + .protoInfo = dummy, + }; + + return true; +} + +void vk_swchain_uninit(struct ra *ra, struct vk_swchain *chain) +{ + struct mpvk_ctx *vk = chain->vk; + if (!vk) + return; + + // Note: We technically don't even need the struct *ra, it's just there + // to "force" the correct uninitialization order at the API level. Either + // way, make sure the RA actually matches.. 
+ assert(ra == chain->ra); + + mpvk_wait_idle(vk); + + for (int i = 0; i < chain->num_images; i++) + ra_tex_free(ra, &chain->images[i]); + for (int i = 0; i < chain->num_acquired; i++) + vkDestroySemaphore(vk->dev, chain->acquired[i], MPVK_ALLOCATOR); + + vkDestroySwapchainKHR(vk->dev, chain->swchain, MPVK_ALLOCATOR); + + talloc_free(chain->images); + talloc_free(chain->acquired); + *chain = (struct vk_swchain){0}; +} + +static void destroy_swapchain(struct mpvk_ctx *vk, VkSwapchainKHR swchain) +{ + vkDestroySwapchainKHR(vk->dev, swchain, MPVK_ALLOCATOR); +} + +bool vk_swchain_resize(struct vk_swchain *chain, int w, int h) +{ + if (w == chain->w && h == chain->h) + return true; + + struct mpvk_ctx *vk = chain->vk; + VkImage *vkimages = NULL; + bool ret = false; + + VkSwapchainCreateInfoKHR sinfo = chain->protoInfo; + sinfo.imageExtent = (VkExtent2D){ w, h }; + sinfo.oldSwapchain = chain->swchain; + + if (!vk_swchain_update_info(chain, &sinfo)) + goto error; + + VK(vkCreateSwapchainKHR(vk->dev, &sinfo, MPVK_ALLOCATOR, &chain->swchain)); + chain->w = w; + chain->h = h; + + // Freeing the old swapchain while it's still in use is an error, so do + // it asynchronously once the device is idle. 
+ if (sinfo.oldSwapchain) + vk_dev_callback(vk, (vk_cb) destroy_swapchain, vk, sinfo.oldSwapchain); + + // Get the new swapchain images + int num; + VK(vkGetSwapchainImagesKHR(vk->dev, chain->swchain, &num, NULL)); + vkimages = talloc_array(NULL, VkImage, num); + VK(vkGetSwapchainImagesKHR(vk->dev, chain->swchain, &num, vkimages)); + + // If needed, allocate some more semaphores + while (num > chain->num_acquired) { + VkSemaphore sem; + static const VkSemaphoreCreateInfo seminfo = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, + }; + VK(vkCreateSemaphore(vk->dev, &seminfo, MPVK_ALLOCATOR, &sem)); + MP_TARRAY_APPEND(NULL, chain->acquired, chain->num_acquired, sem); + } + + // Recreate the ra_tex wrappers + for (int i = 0; i < chain->num_images; i++) + ra_tex_free(chain->ra, &chain->images[i]); + + chain->num_images = num; + MP_TARRAY_GROW(NULL, chain->images, chain->num_images); + for (int i = 0; i < num; i++) { + chain->images[i] = ra_vk_wrap_swchain_img(chain->ra, vkimages[i], sinfo); + if (!chain->images[i]) + goto error; + } + + ret = true; + +error: + talloc_free(vkimages); + return ret; +} + +bool vk_swchain_get(struct vk_swchain *chain, struct vk_swimg *out) +{ + struct mpvk_ctx *vk = chain->vk; + + int semidx = chain->idx_acquired++; + chain->idx_acquired %= chain->num_acquired; + + uint32_t imgidx = 0; + VK(vkAcquireNextImageKHR(vk->dev, chain->swchain, UINT64_MAX, + chain->acquired[semidx], NULL, &imgidx)); + + *out = (struct vk_swimg) { + .chain = chain, + .index = imgidx, + .image = chain->images[imgidx], + .acquired = chain->acquired[semidx], + }; + return true; + +error: + return false; +} diff --git a/video/out/vulkan/utils.h b/video/out/vulkan/utils.h new file mode 100644 index 0000000000000..6273ebca95ef4 --- /dev/null +++ b/video/out/vulkan/utils.h @@ -0,0 +1,178 @@ +#pragma once + +#include "video/out/vo.h" +#include "video/mp_image.h" + +#include "common.h" +#include "formats.h" + +#define VK_LOAD_PFN(name) PFN_##name pfn_##name = 
(PFN_##name) \ + vkGetInstanceProcAddr(vk->inst, #name); + +// Return a human-readable name for various struct mpvk_ctx enums +const char* vk_err(VkResult res); + +// Convenience macros to simplify a lot of common boilerplate +#define VK_ASSERT(res, str) \ + if (res != VK_SUCCESS) { \ + MP_ERR(vk, str ": %s\n", vk_err(res)); \ + goto error; \ + } + +#define VK(cmd) \ + { \ + MP_TRACE(vk, #cmd "\n"); \ + VkResult res ## __LINE__ = (cmd); \ + VK_ASSERT(res ## __LINE__, #cmd); \ + } + +// Uninits everything in the correct order +void mpvk_uninit(struct mpvk_ctx *vk); + +// Initialization functions: As a rule of thumb, these need to be called in +// this order, followed by vk_malloc_init, followed by RA initialization, and +// finally followed by vk_swchain initialization. + +// Create a vulkan instance. Returns VK_NULL_HANDLE on failure +bool mpvk_instance_init(struct mpvk_ctx *vk, bool validate); + +// Generate a VkSurfaceKHR usable for video output. Returns VK_NULL_HANDLE on +// failure. Must be called after mpvk_instance_init. +bool mpvk_surface_init(struct vo *vo, struct mpvk_ctx *vk); + +// Find a suitable physical device for use with rendering and which supports +// the surface. +// name: only match a device with this name +// sw: also allow software/virtual devices +bool mpvk_find_phys_device(struct mpvk_ctx *vk, const char *name, bool sw); + +// Pick a suitable surface format that's supported by this physical device. +bool mpvk_pick_surface_format(struct mpvk_ctx *vk); + +// Create a logical device and initialize the vk_cmdpools +bool mpvk_device_init(struct mpvk_ctx *vk); + +// Wait until all commands submitted to all queues have completed +void mpvk_wait_idle(struct mpvk_ctx *vk); + +// Wait until at least one command submitted to any queue has completed, and +// process the callbacks. Good for event loops that need to delay until a +// command completes. Will block at most `timeout` nanoseconds. 
If used with +// 0, it only garbage collects completed commands without blocking. +void mpvk_poll_cmds(struct mpvk_ctx *vk, struct vk_cmdpool *pool, + uint64_t timeout); + +// Predefined structs for a simple non-layered, non-mipped image +extern const VkImageSubresourceRange vk_range; +extern const VkImageSubresourceLayers vk_layers; + +// Since lots of vulkan operations need to be done lazily once the affected +// resources are no longer in use, provide an abstraction for tracking these. +// In practice, these are only checked and run when submitting new commands, so +// the actual execution may be delayed by a frame. +typedef void (*vk_cb)(void *priv, void *arg); + +struct vk_callback { + vk_cb run; + void *priv; + void *arg; // as a convenience, you also get to pass an arg for "free" +}; + +// Associate a callback with the completion of all currently pending commands. +// This will essentially run once the device is completely idle. +void vk_dev_callback(struct mpvk_ctx *vk, vk_cb callback, void *p, void *arg); + +#define MPVK_MAX_CMD_DEPS 8 + +// Helper wrapper around command buffers that also track dependencies, +// callbacks and synchronization primitives +struct vk_cmd { + struct vk_cmdpool *pool; // pool it was allocated from + VkCommandBuffer buf; + VkFence fence; // the fence guards cmd buffer reuse + VkSemaphore done; // the semaphore signals when execution is done + // The semaphores represent dependencies that need to complete before + // this command can be executed. These are *not* owned by the vk_cmd + VkSemaphore deps[MPVK_MAX_CMD_DEPS]; + VkPipelineStageFlags depstages[MPVK_MAX_CMD_DEPS]; + int num_deps; + // Since VkFences are useless, we have to manually track "callbacks" + // to fire once the VkFence completes. These are used for multiple purposes, + // ranging from garbage collection (resource deallocation) to fencing. 
+ struct vk_callback *callbacks; + int num_callbacks; +}; + +// Associate a callback with the completion of the current command. This +// bool will be set to `true` once the command completes, or shortly thereafter. +void vk_cmd_callback(struct vk_cmd *cmd, vk_cb callback, void *p, void *arg); + +// Associate a dependency for the current command. This semaphore must signal +// by the corresponding stage before the command may execute. +void vk_cmd_dep(struct vk_cmd *cmd, VkSemaphore dep, + VkPipelineStageFlagBits depstage); + +#define MPVK_MAX_QUEUES 8 +#define MPVK_MAX_CMDS 16 + +// Command pool / queue family hybrid abstraction +struct vk_cmdpool { + VkQueueFamilyProperties props; + uint32_t qf; // queue family index + VkCommandPool pool; + VkQueue queues[MPVK_MAX_QUEUES]; + int qcount; + int qindex; + // Command buffers associated with this queue. (No, VkCommandPool is not + // a pool of command buffers), you still have to pool them manually. We + // also have to track of "in flight" (pending) command buffers separately + // to work around vkQueueWaitIdle being completely fucking useless when + // using a queue for presentation. + struct vk_cmd cmds[MPVK_MAX_CMDS]; + int cindex; + int cindex_pending; +}; + +// Fetch the next command buffer from a command pool and begin recording to it. +// Returns NULL on failure. +struct vk_cmd *vk_cmd_begin(struct mpvk_ctx *vk, struct vk_cmdpool *pool); + +// Finish the currently recording command buffer and submit it for execution. +// If `done` is not NULL, it will be set to a semaphore that will signal once +// the command completes. (And MUST have a corresponding semaphore wait) +// Returns whether successful. 
+bool vk_cmd_submit(struct mpvk_ctx *vk, struct vk_cmd *cmd, VkSemaphore *done); + +// Swapchain +struct vk_swchain { + struct mpvk_ctx *vk; + struct ra *ra; + int w, h; // current size + VkSwapchainCreateInfoKHR protoInfo; // partially filled-in prototype + VkSwapchainKHR swchain; + // state of the images: + struct ra_tex **images; // ra_tex wrappers for the vkimages + int num_images; // size of images + VkSemaphore *acquired; // pool of semaphores used to synchronize images + int num_acquired; // size of this pool + int idx_acquired; // index of next free semaphore within this pool +}; + +// depth: desired depth +bool vk_swchain_init(struct mpvk_ctx *vk, struct ra *ra, int depth, + struct vk_swchain *chain); +void vk_swchain_uninit(struct ra *ra, struct vk_swchain *chain); +bool vk_swchain_resize(struct vk_swchain *chain, int w, int h); + +// Swapchain image +struct vk_swimg { + struct vk_swchain *chain; // vk_swchain it was allocated from + int index; // index within that vk_swchain + struct ra_tex *image; // ra_tex wrapper for the this image + VkSemaphore acquired; // will be signalled once the image is ready +}; + +// Get the next vk_swimg. This may block if the swapchain images are exceeded, +// but normally the user should allocate a larger swapchain than what they +// actually use. +bool vk_swchain_get(struct vk_swchain *chain, struct vk_swimg *out); diff --git a/wscript b/wscript index 9d885884d7476..42e5e726650da 100644 --- a/wscript +++ b/wscript @@ -780,6 +780,16 @@ video_output_features = [ 'fmsg': "No OpenGL video output found or enabled. " + "Aborting. If you really mean to compile without OpenGL " + "video outputs use --disable-gl." 
+ }, { + 'name': '--vulkan-xlib', + 'desc': 'Vulkan Xlib backend', + 'func': check_true, + 'deps': ['x11'], + }, { + 'name': '--vulkan', + 'desc': 'Vulkan video output', + 'deps_any': [ 'vulkan-xlib' ], + 'func': check_cc(header_name='vulkan/vulkan.h', lib='vulkan'), }, { 'name': 'egl-helpers', 'desc': 'EGL helper functions', diff --git a/wscript_build.py b/wscript_build.py index 3c5c00dc6415e..878b1faf02513 100644 --- a/wscript_build.py +++ b/wscript_build.py @@ -437,12 +437,17 @@ def build(ctx): ( "video/out/vo_tct.c" ), ( "video/out/vo_vaapi.c", "vaapi-x11" ), ( "video/out/vo_vdpau.c", "vdpau" ), + ( "video/out/vo_vulkan.c", "vulkan" ), ( "video/out/vo_wayland.c", "wayland" ), ( "video/out/vo_x11.c" , "x11" ), ( "video/out/vo_xv.c", "xv" ), ( "video/out/w32_common.c", "win32-desktop" ), ( "video/out/win32/displayconfig.c", "win32-desktop" ), ( "video/out/win32/droptarget.c", "win32-desktop" ), + ( "video/out/vulkan/utils.c", "vulkan" ), + ( "video/out/vulkan/malloc.c", "vulkan" ), + ( "video/out/vulkan/formats.c", "vulkan" ), + ( "video/out/vulkan/ra_vk.c", "vulkan" ), ( "video/out/win32/exclusive_hack.c", "gl-win32" ), ( "video/out/wayland_common.c", "wayland" ), ( "video/out/wayland/buffer.c", "wayland" ),