diff --git a/video/out/opengl/ra.h b/video/out/opengl/ra.h index ae7fb9aea730a..1f716d98f8bdc 100644 --- a/video/out/opengl/ra.h +++ b/video/out/opengl/ra.h @@ -146,6 +146,7 @@ enum ra_buf_type { RA_BUF_TYPE_TEX_UPLOAD, // texture upload buffer (pixel buffer object) RA_BUF_TYPE_SHADER_STORAGE, // shader buffer (SSBO), for RA_VARTYPE_BUF_RW RA_BUF_TYPE_UNIFORM, // uniform buffer (UBO), for RA_VARTYPE_BUF_RO + RA_BUF_TYPE_VERTEX, // not publicly usable (RA-internal usage) }; struct ra_buf_params { @@ -369,10 +370,10 @@ struct ra_fns { void (*buf_destroy)(struct ra *ra, struct ra_buf *buf); - // Update the contents of a buffer, starting at a given offset and up to a - // given size, with the contents of *data. This is an extremely common - // operation. Calling this while the buffer is considered "in use" is an - // error. (See: buf_poll) + // Update the contents of a buffer, starting at a given offset (*must* be a + // multiple of 4) and up to a given size, with the contents of *data. This + // is an extremely common operation. Calling this while the buffer is + // considered "in use" is an error. 
(See: buf_poll) void (*buf_update)(struct ra *ra, struct ra_buf *buf, ptrdiff_t offset, const void *data, size_t size); diff --git a/video/out/opengl/utils.c b/video/out/opengl/utils.c index b8fc24a52e133..aeadd346b9408 100644 --- a/video/out/opengl/utils.c +++ b/video/out/opengl/utils.c @@ -64,7 +64,8 @@ static bool ra_buf_pool_grow(struct ra *ra, struct ra_buf_pool *pool) return false; MP_TARRAY_INSERT_AT(NULL, pool->buffers, pool->num_buffers, pool->index, buf); - MP_VERBOSE(ra, "Resized buffer pool to size %d\n", pool->num_buffers); + MP_VERBOSE(ra, "Resized buffer pool of type %u to size %d\n", + pool->current_params.type, pool->num_buffers); return true; } diff --git a/video/out/vo.c b/video/out/vo.c index f9c5d04e24be0..06507c7f87694 100644 --- a/video/out/vo.c +++ b/video/out/vo.c @@ -60,6 +60,7 @@ extern const struct vo_driver video_out_drm; extern const struct vo_driver video_out_direct3d; extern const struct vo_driver video_out_sdl; extern const struct vo_driver video_out_vaapi; +extern const struct vo_driver video_out_vulkan; extern const struct vo_driver video_out_wayland; extern const struct vo_driver video_out_rpi; extern const struct vo_driver video_out_tct; @@ -78,6 +79,9 @@ const struct vo_driver *const video_out_drivers[] = #if HAVE_DIRECT3D &video_out_direct3d, #endif +#if HAVE_VULKAN + &video_out_vulkan, +#endif #if HAVE_WAYLAND &video_out_wayland, #endif diff --git a/video/out/vo_vulkan.c b/video/out/vo_vulkan.c new file mode 100644 index 0000000000000..9e6c7984c6a3d --- /dev/null +++ b/video/out/vo_vulkan.c @@ -0,0 +1,335 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . + */ + +#include "mpv_talloc.h" +#include "options/m_config.h" +#include "osdep/timer.h" +#include "video/mp_image.h" +#include "video/out/x11_common.h" +#include "vo.h" +#include "sub/osd.h" + +#include "opengl/ra.h" +#include "opengl/video.h" + +#include "vulkan/common.h" +#include "vulkan/utils.h" +#include "vulkan/ra_vk.h" + +struct vo_vulkan_opts { + int debug; // whether to load the validation layers or not + int allow_sw; // whether to allow software devices + char *device; // force a specific GPU + int swsize; // swapchain size + int swdepth; // swapchain depth +}; + +struct vk_priv { + struct vo *vo; + struct mp_log *log; + + struct vo_vulkan_opts opts; + + struct mpvk_ctx vk; + struct ra *ra; + struct gl_video *renderer; + + struct vk_swchain swchain; + int frames_in_flight; +}; + +static bool resize(struct vo *vo) +{ + struct vk_priv *p = vo->priv; + + MP_VERBOSE(vo, "Resize: %dx%d\n", vo->dwidth, vo->dheight); + + if (!vk_swchain_resize(&p->swchain, vo->dwidth, vo->dheight)) { + MP_ERR(vo, "Failed resizing swapchain!\n"); + return false; + } + + struct mp_rect src, dst; + struct mp_osd_res osd; + vo_get_src_dst_rects(vo, &src, &dst, &osd); + + gl_video_resize(p->renderer, &src, &dst, &osd); + + vo->want_redraw = true; + return true; +} + +static int reconfig(struct vo *vo, struct mp_image_params *params) +{ + struct vk_priv *p = vo->priv; + + if (vo->x11) + vo_x11_config_vo_window(vo); + + if (!resize(vo)) + return VO_ERROR; + + gl_video_config(p->renderer, params); + + return 0; +} + +static void uninit(struct vo *vo) +{ + struct vk_priv *p = vo->priv; + struct mpvk_ctx *vk = &p->vk; + + 
gl_video_uninit(p->renderer); + + if (p->ra) { + vk_swchain_uninit(p->ra, &p->swchain); + p->ra->fns->destroy(p->ra); + } + + // Clean up platform-specific windowing stuff. Do this first to prevent + // keeping around the window for long, then we can uninit the device etc. + // afterwards + if (vo->x11) + vo_x11_uninit(vo); + + mpvk_uninit(vk); +} + +static int preinit(struct vo *vo) +{ + struct vk_priv *p = vo->priv; + struct mpvk_ctx *vk = &p->vk; + p->vo = vo; + p->log = vk->log = vo->log; + + if (!mpvk_instance_init(vk, p->opts.debug)) + goto error; + if (!mpvk_surface_init(vo, vk)) + goto error; + if (!mpvk_find_phys_device(vk, p->opts.device, p->opts.allow_sw)) + goto error; + if (!mpvk_pick_surface_format(vk)) + goto error; + if (!mpvk_device_init(vk)) + goto error; + p->ra = ra_create_vk(vk, p->log); + if (!p->ra) + goto error; + if (!vk_swchain_init(vk, p->ra, p->opts.swsize, &p->swchain)) + goto error; + + p->renderer = gl_video_init(p->ra, vo->log, vo->global); + gl_video_set_osd_source(p->renderer, vo->osd); + gl_video_configure_queue(p->renderer, vo); + + return 0; + +error: + uninit(vo); + return -1; +} + +static int control(struct vo *vo, uint32_t request, void *data) +{ + struct vk_priv *p = vo->priv; + + switch (request) { + case VOCTRL_SET_PANSCAN: + return resize(vo) ? 
VO_TRUE : VO_ERROR; + case VOCTRL_SET_EQUALIZER: + vo->want_redraw = true; + return VO_TRUE; + case VOCTRL_UPDATE_RENDER_OPTS: { + gl_video_update_options(p->renderer); + gl_video_configure_queue(p->renderer, p->vo); + p->vo->want_redraw = true; + return true; + } + case VOCTRL_RESET: + gl_video_reset(p->renderer); + return true; + case VOCTRL_PAUSE: + if (gl_video_showing_interpolated_frame(p->renderer)) + vo->want_redraw = true; + return true; + case VOCTRL_PERFORMANCE_DATA: + gl_video_perfdata(p->renderer, (struct voctrl_performance_data *)data); + return true; + } + + int events = 0, r = 0; + + if (vo->x11) + r |= vo_x11_control(vo, &events, request, data); + + if (events & VO_EVENT_RESIZE) + r |= resize(vo) ? 0 : VO_ERROR; + + if (events & VO_EVENT_EXPOSE) + vo->want_redraw = true; + + vo_event(vo, events); + return r; +} + +static void draw_frame(struct vo *vo, struct vo_frame *frame) +{ + struct vk_priv *p = vo->priv; + struct vk_swimg swimg; + if (!vk_swchain_get(&p->swchain, &swimg)) + goto error; + + struct fbodst target = { + .tex = swimg.image, + .flip = false, + }; + + gl_video_render_frame(p->renderer, frame, target); + if (!ra_vk_present_frame(p->ra, &swimg, &p->frames_in_flight)) { + MP_ERR(vo, "Failed presenting frame!\n"); + goto error; + } + +error: + return; +} + +static void flip_page(struct vo *vo) +{ + struct vk_priv *p = vo->priv; + while (p->frames_in_flight >= p->opts.swdepth) + mpvk_poll_cmds(&p->vk, p->vk.pool, UINT64_MAX); +} + +static int query_format(struct vo *vo, int format) +{ + struct vk_priv *p = vo->priv; + if (!gl_video_check_format(p->renderer, format)) + return 0; + return 1; +} + +static void wakeup(struct vo *vo) +{ + if (vo->x11) + vo_x11_wakeup(vo); +} + +static void wait_events(struct vo *vo, int64_t until_time_us) +{ + if (vo->x11) { + vo_x11_wait_events(vo, until_time_us); + } else { + vo_wait_default(vo, until_time_us); + } +} + +static struct mp_image *get_image(struct vo *vo, int imgfmt, int w, int h, + int 
stride_align) +{ + struct vk_priv *p = vo->priv; + return gl_video_get_image(p->renderer, imgfmt, w, h, stride_align); +} + +static int vk_validate_dev(struct mp_log *log, const struct m_option *opt, + struct bstr name, struct bstr param) +{ + int ret = M_OPT_INVALID; + VkResult res; + + // Create a dummy instance to validate/list the devices + VkInstanceCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, + }; + + VkInstance inst; + VkPhysicalDevice *devices = NULL; + uint32_t num = 0; + + res = vkCreateInstance(&info, MPVK_ALLOCATOR, &inst); + if (res != VK_SUCCESS) + goto error; + + res = vkEnumeratePhysicalDevices(inst, &num, NULL); + if (res != VK_SUCCESS) + goto error; + + devices = talloc_array(NULL, VkPhysicalDevice, num); + vkEnumeratePhysicalDevices(inst, &num, devices); + if (res != VK_SUCCESS) + goto error; + + bool help = bstr_equals0(param, "help"); + if (help) { + mp_info(log, "Available vulkan devices:\n"); + ret = M_OPT_EXIT; + } + + for (int i = 0; i < num; i++) { + VkPhysicalDeviceProperties prop; + vkGetPhysicalDeviceProperties(devices[i], &prop); + + if (help) { + mp_info(log, " '%s' (GPU %d, ID %x:%x)\n", prop.deviceName, i, + prop.vendorID, prop.deviceID); + } else if (bstr_equals0(param, prop.deviceName)) { + ret = 0; + break; + } + } + + if (!help) + mp_err(log, "No device with name '%.*s'!\n", BSTR_P(param)); + +error: + talloc_free(devices); + return ret; +} + +#define OPT_BASE_STRUCT struct vk_priv + +const struct vo_driver video_out_vulkan = { + .description = "Vulkan Renderer", + .name = "vulkan", + .preinit = preinit, + .query_format = query_format, + .reconfig = reconfig, + .control = control, + .get_image = get_image, + .draw_frame = draw_frame, + .flip_page = flip_page, + .wait_events = wait_events, + .wakeup = wakeup, + .uninit = uninit, + .priv_size = sizeof(struct vk_priv), + .options = (const m_option_t[]) { + OPT_FLAG("vulkan-debug", opts.debug, 0), + OPT_FLAG("vulkan-sw", opts.allow_sw, 0), + 
OPT_STRING_VALIDATE("vulkan-device", opts.device, 0, vk_validate_dev), + OPT_INTRANGE("vulkan-swapchain-size", opts.swsize, 0, 1, + MPVK_MAX_STREAMING_DEPTH), + OPT_INTRANGE("vulkan-swapchain-depth", opts.swdepth, 0, 1, + MPVK_MAX_STREAMING_DEPTH), + {0} + }, + .priv_defaults = &(const struct vk_priv) { + .opts = { + .swsize = 8, + .swdepth = 1, + }, + }, +}; diff --git a/video/out/vulkan/common.h b/video/out/vulkan/common.h new file mode 100644 index 0000000000000..9113d27a6a201 --- /dev/null +++ b/video/out/vulkan/common.h @@ -0,0 +1,50 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "config.h" + +#include "common/common.h" +#include "common/msg.h" + +// We need to define all platforms we want to support. Since we have +// our own mechanism for checking this, we re-define the right symbols +#if HAVE_VULKAN_XLIB +#define VK_USE_PLATFORM_XLIB_KHR +#endif + +#include + +// Vulkan allows the optional use of a custom allocator. We don't need one but +// mark this parameter with a better name in case we ever decide to change this +// in the future. (And to make the code more readable) +#define MPVK_ALLOCATOR NULL + +// A lot of things depend on streaming resources across frames. Depending on +// how many frames we render ahead of time, we need to pick enough to avoid +// any conflicts, so make all of these tunable relative to this constant in +// order to centralize them. 
+#define MPVK_MAX_STREAMING_DEPTH 8 + +// Shared struct used to hold vulkan context information +struct mpvk_ctx { + struct mp_log *log; + VkInstance inst; + VkPhysicalDevice physd; + VkDebugReportCallbackEXT dbg; + VkDevice dev; + + // Surface, must be initialized fter the context itself + VkSurfaceKHR surf; + VkSurfaceFormatKHR surf_format; // picked at surface initialization time + + struct vk_malloc *alloc; // memory allocator for this device + struct vk_cmdpool *pool; // command pool for this device + + // Cached capabilities + VkPhysicalDeviceLimits limits; +}; diff --git a/video/out/vulkan/formats.c b/video/out/vulkan/formats.c new file mode 100644 index 0000000000000..b44bead99cc80 --- /dev/null +++ b/video/out/vulkan/formats.c @@ -0,0 +1,55 @@ +#include "formats.h" + +const struct vk_format vk_formats[] = { + // Regular, byte-aligned integer formats + {"r8", VK_FORMAT_R8_UNORM, 1, 1, {8 }, RA_CTYPE_UNORM }, + {"rg8", VK_FORMAT_R8G8_UNORM, 2, 2, {8, 8 }, RA_CTYPE_UNORM }, + {"rgb8", VK_FORMAT_R8G8B8_UNORM, 3, 3, {8, 8, 8 }, RA_CTYPE_UNORM }, + {"rgba8", VK_FORMAT_R8G8B8A8_UNORM, 4, 4, {8, 8, 8, 8 }, RA_CTYPE_UNORM }, + {"r16", VK_FORMAT_R16_UNORM, 1, 2, {16 }, RA_CTYPE_UNORM }, + {"rg16", VK_FORMAT_R16G16_UNORM, 2, 4, {16, 16 }, RA_CTYPE_UNORM }, + {"rgb16", VK_FORMAT_R16G16B16_UNORM, 3, 6, {16, 16, 16 }, RA_CTYPE_UNORM }, + {"rgba16", VK_FORMAT_R16G16B16A16_UNORM, 4, 8, {16, 16, 16, 16}, RA_CTYPE_UNORM }, + + // Special, integer-only formats + {"r32ui", VK_FORMAT_R32_UINT, 1, 4, {32 }, RA_CTYPE_UINT }, + {"rg32ui", VK_FORMAT_R32G32_UINT, 2, 8, {32, 32 }, RA_CTYPE_UINT }, + {"rgb32ui", VK_FORMAT_R32G32B32_UINT, 3, 12, {32, 32, 32 }, RA_CTYPE_UINT }, + {"rgba32ui", VK_FORMAT_R32G32B32A32_UINT, 4, 16, {32, 32, 32, 32}, RA_CTYPE_UINT }, + {"r64ui", VK_FORMAT_R64_UINT, 1, 8, {64 }, RA_CTYPE_UINT }, + {"rg64ui", VK_FORMAT_R64G64_UINT, 2, 16, {64, 64 }, RA_CTYPE_UINT }, + {"rgb64ui", VK_FORMAT_R64G64B64_UINT, 3, 24, {64, 64, 64 }, RA_CTYPE_UINT }, + {"rgba64ui", 
VK_FORMAT_R64G64B64A64_UINT, 4, 32, {64, 64, 64, 64}, RA_CTYPE_UINT }, + + // Packed integer formats + {"rg4", VK_FORMAT_R4G4_UNORM_PACK8, 2, 1, {4, 4 }, RA_CTYPE_UNORM }, + {"rgba4", VK_FORMAT_R4G4B4A4_UNORM_PACK16, 4, 2, {4, 4, 4, 4 }, RA_CTYPE_UNORM }, + {"rgb565", VK_FORMAT_R5G6B5_UNORM_PACK16, 3, 2, {5, 6, 5 }, RA_CTYPE_UNORM }, + {"rgb565a1", VK_FORMAT_R5G5B5A1_UNORM_PACK16, 4, 2, {5, 5, 5, 1 }, RA_CTYPE_UNORM }, + + // Float formats (native formats, hf = half float, df = double float) + {"r16hf", VK_FORMAT_R16_SFLOAT, 1, 2, {16 }, RA_CTYPE_FLOAT }, + {"rg16hf", VK_FORMAT_R16G16_SFLOAT, 2, 4, {16, 16 }, RA_CTYPE_FLOAT }, + {"rgb16hf", VK_FORMAT_R16G16B16_SFLOAT, 3, 6, {16, 16, 16 }, RA_CTYPE_FLOAT }, + {"rgba16hf", VK_FORMAT_R16G16B16A16_SFLOAT, 4, 8, {16, 16, 16, 16}, RA_CTYPE_FLOAT }, + {"r32f", VK_FORMAT_R32_SFLOAT, 1, 4, {32 }, RA_CTYPE_FLOAT }, + {"rg32f", VK_FORMAT_R32G32_SFLOAT, 2, 8, {32, 32 }, RA_CTYPE_FLOAT }, + {"rgb32f", VK_FORMAT_R32G32B32_SFLOAT, 3, 12, {32, 32, 32 }, RA_CTYPE_FLOAT }, + {"rgba32f", VK_FORMAT_R32G32B32A32_SFLOAT, 4, 16, {32, 32, 32, 32}, RA_CTYPE_FLOAT }, + {"r64df", VK_FORMAT_R64_SFLOAT, 1, 8, {64 }, RA_CTYPE_FLOAT }, + {"rg64df", VK_FORMAT_R64G64_SFLOAT, 2, 16, {64, 64 }, RA_CTYPE_FLOAT }, + {"rgb64df", VK_FORMAT_R64G64B64_SFLOAT, 3, 24, {64, 64, 64 }, RA_CTYPE_FLOAT }, + {"rgba64df", VK_FORMAT_R64G64B64A64_SFLOAT, 4, 32, {64, 64, 64, 64}, RA_CTYPE_FLOAT }, + + // "Swapped" component order images + {"bgr8", VK_FORMAT_B8G8R8_UNORM, 3, 3, {8, 8, 8 }, RA_CTYPE_UNORM, true }, + {"bgra8", VK_FORMAT_B8G8R8A8_UNORM, 4, 4, {8, 8, 8, 8 }, RA_CTYPE_UNORM, true }, + {"bgra4", VK_FORMAT_B4G4R4A4_UNORM_PACK16, 4, 2, {4, 4, 4, 4 }, RA_CTYPE_UNORM, true }, + {"bgr565", VK_FORMAT_B5G6R5_UNORM_PACK16, 3, 2, {5, 6, 5 }, RA_CTYPE_UNORM, true }, + {"bgr565a1", VK_FORMAT_B5G5R5A1_UNORM_PACK16, 4, 2, {5, 5, 5, 1 }, RA_CTYPE_UNORM, true }, + {"a1rgb5", VK_FORMAT_A1R5G5B5_UNORM_PACK16, 4, 2, {1, 5, 5, 5 }, RA_CTYPE_UNORM, true }, + {"a2rgb10", 
VK_FORMAT_A2R10G10B10_UNORM_PACK32, 4, 4, {2, 10, 10, 10}, RA_CTYPE_UNORM, true }, + {"a2bgr10", VK_FORMAT_A2B10G10R10_UNORM_PACK32, 4, 4, {2, 10, 10, 10}, RA_CTYPE_UNORM, true }, + {"abgr8", VK_FORMAT_A8B8G8R8_UNORM_PACK32, 4, 4, {8, 8, 8, 8 }, RA_CTYPE_UNORM, true }, + {0} +}; diff --git a/video/out/vulkan/formats.h b/video/out/vulkan/formats.h new file mode 100644 index 0000000000000..e57275a153a12 --- /dev/null +++ b/video/out/vulkan/formats.h @@ -0,0 +1,16 @@ +#pragma once + +#include "video/out/opengl/ra.h" +#include "common.h" + +struct vk_format { + const char *name; + VkFormat iformat; // vulkan format enum + int components; // how many components are there + int bytes; // how many bytes is a texel + int bits[4]; // how many bits per component + enum ra_ctype ctype; // format representation type + bool fucked_order; // used for formats which are not simply rgba +}; + +extern const struct vk_format vk_formats[]; diff --git a/video/out/vulkan/malloc.c b/video/out/vulkan/malloc.c new file mode 100644 index 0000000000000..cdab6eb590e63 --- /dev/null +++ b/video/out/vulkan/malloc.c @@ -0,0 +1,315 @@ +#include "malloc.h" +#include "utils.h" +#include "osdep/timer.h" + +// Controls how much more space we will allocate than actually necessary. +// Increasing this number increases the amount of memory used in total, but +// decreases the frequency at which slabs need to be allocated and freed. A +// value of 4 means the slabs will be allocated 4 times as large as they need +// to be. +#define MPVK_HEAP_SLAB_OVERCOMMIT 4 + +// Controls the minimum slab size, to avoid overusing small slabs when +// allocating many small slabs. (Default: 1 MB) +#define MPVK_HEAP_MINIMUM_SLAB_SIZE (1 << 20) + +// A single slab represents a contiguous region of allocated memory. Actual +// allocations are served as slices of this. Slabs are organized into linked +// lists, which represent individual heaps. 
+struct vk_slab { + struct vk_slab *next; // pointer to next vk_slab, or NULL + VkDeviceMemory mem; // underlying device allocation + VkDeviceSize size; // total size of `slab` + VkDeviceSize used; // number of bytes actually in use (for GC accounting) + VkDeviceSize index; // next free byte in `slab` + // optional, depends on the memory type: + VkBuffer buffer; // buffer spanning the entire slab + void *data; // mapped memory corresponding to `mem` +}; + +struct vk_heap { + VkBufferUsageFlagBits usage; // or 0 for generic heaps + struct vk_slab *tip; // linked list of slabs that form this heap +}; + +// Represents a single memory type. All allocations of this memory type are +// grouped together into heaps; one per buffer usage type and one for generic +// allocations (e.g. images). +struct vk_memtype { + int index; // the memory type index + int heapIndex; // the memory heap index + VkMemoryPropertyFlagBits flags; // the memory type bits + struct vk_heap generic_heap; // the heap for generic allocations + // An array of heaps for each possible buffer type (grows dynamically): + // This is an array of sub-allocations, so we can resize the buf_heaps + // array without breaking the vk_heap pointers in memslice.priv. + struct vk_heap **buf_heaps; + int num_buf_heaps; +}; + +// The overall state of the allocator, which keeps track of a vk_heap for each +// memory type supported by the device. 
+struct vk_malloc { + struct vk_memtype types[VK_MAX_MEMORY_TYPES]; + int num_types; +}; + +void vk_malloc_init(struct mpvk_ctx *vk) +{ + assert(vk->physd); + + struct vk_malloc *ma = vk->alloc = talloc_zero(NULL, struct vk_malloc); + + VkPhysicalDeviceMemoryProperties prop; + vkGetPhysicalDeviceMemoryProperties(vk->physd, &prop); + + ma->num_types = prop.memoryTypeCount; + for (int i = 0; i < prop.memoryTypeCount; i++) { + ma->types[i] = (struct vk_memtype) { + .index = i, + .heapIndex = prop.memoryTypes[i].heapIndex, + .flags = prop.memoryTypes[i].propertyFlags, + }; + } +} + +// "Unlinks" a slab. The slab_ptr is updated to the next link in the chain, +// or NULL if none left. +static void slab_free(struct mpvk_ctx *vk, struct vk_slab **slab_ptr) +{ + struct vk_slab *slab = *slab_ptr; + if (!slab) + return; + + assert(slab->used == 0); + + int64_t start = mp_time_us(); + vkDestroyBuffer(vk->dev, slab->buffer, MPVK_ALLOCATOR); + // also implicitly unmaps the memory if needed + vkFreeMemory(vk->dev, slab->mem, MPVK_ALLOCATOR); + int64_t stop = mp_time_us(); + + MP_VERBOSE(vk, "Freeing slab of size %lu took %ld μs.\n", + slab->size, stop - start); + + *slab_ptr = slab->next; + talloc_free(slab); +} + +static void heap_uninit(struct mpvk_ctx *vk, struct vk_heap *heap) +{ + while (heap->tip) + slab_free(vk, &heap->tip); +} + +void vk_malloc_uninit(struct mpvk_ctx *vk) +{ + struct vk_malloc *ma = vk->alloc; + if (!ma) + return; + + for (int i = 0; i < ma->num_types; i++) { + heap_uninit(vk, &ma->types[i].generic_heap); + for (int b = 0; b < ma->types[i].num_buf_heaps; b++) { + heap_uninit(vk, ma->types[i].buf_heaps[b]); + talloc_free(ma->types[i].buf_heaps[b]); + } + talloc_free(ma->types[i].buf_heaps); + } + + talloc_free(vk->alloc); +} + +// reqs: optional +static struct vk_memtype *find_best_memtype(struct mpvk_ctx *vk, + VkMemoryPropertyFlagBits flags, + VkMemoryRequirements *reqs) +{ + struct vk_malloc *ma = vk->alloc; + + // The vulkan spec requires memory types 
to be sorted in the "optimal" + // order, so the first matching type we find will be the best/fastest one. + for (int i = 0; i < ma->num_types; i++) { + // The memory type flags must include our properties + if ((ma->types[i].flags & flags) != flags) + continue; + // The memory type must be supported by the requirements (bitfield) + if (reqs && !(reqs->memoryTypeBits & (1 << i))) + continue; + + return &ma->types[i]; + } + + MP_ERR(vk, "Found no memory type matching property flags 0x%x!\n", flags); + return NULL; +} + +// Resizes a heap to make sure we have enough free bytes to serve an allocation +static bool resize_heap(struct mpvk_ctx *vk, struct vk_memtype *type, + struct vk_heap *heap, VkDeviceSize size, + VkDeviceSize align) +{ + // If the tip already exists and is large enough, we can return right away + if (heap->tip) { + if (MP_ALIGN_UP(heap->tip->index, align) + size <= heap->tip->size) + return true; + + // If the tip exists but is not large enough and has no other current + // allocations, free it right away to avoid accumulating garbage. + if (heap->tip->used == 0) + slab_free(vk, &heap->tip); + } + + // Otherwise, allocate a new vk_slab and prepend it to the linked list + struct vk_slab *slab = talloc_ptrtype(NULL, slab); + + VkDeviceSize minSize = MPMAX(MPVK_HEAP_MINIMUM_SLAB_SIZE, + MPVK_HEAP_SLAB_OVERCOMMIT * size); + *slab = (struct vk_slab) { + .next = heap->tip, + .size = heap->tip ? 
MPMAX(heap->tip->size, minSize) : minSize, + }; + + MP_VERBOSE(vk, "Allocating %lu memory of type 0x%x (id %d) in heap %d.\n", + slab->size, type->flags, type->index, type->heapIndex); + + VkMemoryAllocateInfo minfo = { + .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, + .memoryTypeIndex = type->index, + .allocationSize = slab->size, + }; + + if (heap->usage) { + VkBufferCreateInfo binfo = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .size = slab->size, + .usage = heap->usage, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + }; + + VK(vkCreateBuffer(vk->dev, &binfo, MPVK_ALLOCATOR, &slab->buffer)); + + VkMemoryRequirements reqs; + vkGetBufferMemoryRequirements(vk->dev, slab->buffer, &reqs); + minfo.allocationSize = reqs.size; // this can be larger than slab->size + + // Sanity check the memory requirements to make sure we didn't screw up + if (!(reqs.memoryTypeBits & (1 << type->index))) { + MP_ERR(vk, "Chosen memory type %d does not support buffer usage " + "0x%x!\n", type->index, heap->usage); + goto error; + } + } + + VK(vkAllocateMemory(vk->dev, &minfo, MPVK_ALLOCATOR, &slab->mem)); + + if (type->flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) + VK(vkMapMemory(vk->dev, slab->mem, 0, VK_WHOLE_SIZE, 0, &slab->data)); + + if (heap->usage) + VK(vkBindBufferMemory(vk->dev, slab->buffer, slab->mem, 0)); + + heap->tip = slab; + return true; + +error: + slab_free(vk, &slab); + return false; +} + +void vk_free_memslice(struct mpvk_ctx *vk, struct vk_memslice slice) +{ + struct vk_heap *heap = slice.priv; + + // Find the slab containing this allocation, while also keeping track + // of the pointer to it (so we can unlink it from the list if needed) + struct vk_slab **slab_ptr = &heap->tip; + struct vk_slab *slab = *slab_ptr; + while (slab) { + if (slab->mem == slice.vkmem) + break; + slab_ptr = &slab->next; + slab = *slab_ptr; + } + + assert(slab); + assert(slab->used >= slice.size); + slab->used -= slice.size; + + MP_DBG(vk, "Freeing slice %lu + %lu from slab with 
size %lu\n", + slice.offset, slice.size, slab->size); + + if (slab->used == 0 && slab != heap->tip) + slab_free(vk, slab_ptr); +} + +static bool slice_heap(struct mpvk_ctx *vk, struct vk_memtype *type, + struct vk_heap *heap, VkDeviceSize size, + VkDeviceSize alignment, struct vk_memslice *out) +{ + if (!resize_heap(vk, type, heap, size, alignment)) + return false; + + struct vk_slab *tip = heap->tip; + assert(tip); + *out = (struct vk_memslice) { + .vkmem = tip->mem, + .offset = MP_ALIGN_UP(tip->index, alignment), + .size = size, + .priv = heap, + }; + + MP_DBG(vk, "Sub-allocating slice %lu + %lu from slab with size %lu\n", + out->offset, out->size, tip->size); + + tip->index = out->offset + size; + tip->used += size; + return true; +} + +bool vk_malloc_generic(struct mpvk_ctx *vk, VkMemoryRequirements reqs, + VkMemoryPropertyFlagBits flags, struct vk_memslice *out) +{ + struct vk_memtype *type = find_best_memtype(vk, flags, &reqs); + if (!type) + return false; + + struct vk_heap *heap = &type->generic_heap; + return slice_heap(vk, type, heap, reqs.size, reqs.alignment, out); +} + +bool vk_malloc_buffer(struct mpvk_ctx *vk, VkBufferUsageFlagBits bufFlags, + VkMemoryPropertyFlagBits memFlags, VkDeviceSize size, + VkDeviceSize alignment, struct vk_bufslice *out) +{ + struct vk_memtype *type = find_best_memtype(vk, memFlags, NULL); + if (!type) + return false; + + struct vk_heap *heap = NULL; + for (int i = 0; i < type->num_buf_heaps; i++) { + if (type->buf_heaps[i]->usage == bufFlags) { + heap = type->buf_heaps[i]; + goto found; + } + } + + // no buffer heap with this type => add it + MP_TARRAY_GROW(NULL, type->buf_heaps, type->num_buf_heaps + 1); + heap = type->buf_heaps[type->num_buf_heaps++] = talloc_ptrtype(NULL, heap); + + *heap = (struct vk_heap) { + .usage = bufFlags, + }; + +found: + if (!slice_heap(vk, type, heap, size, alignment, &out->mem)) + return false; + + struct vk_slab *tip = heap->tip; + out->buf = tip->buffer; + if (tip->data) + out->data = (void 
*)((uintptr_t)tip->data + (ptrdiff_t)out->mem.offset); + + return true; +} diff --git a/video/out/vulkan/malloc.h b/video/out/vulkan/malloc.h new file mode 100644 index 0000000000000..1963950d54f3a --- /dev/null +++ b/video/out/vulkan/malloc.h @@ -0,0 +1,35 @@ +#pragma once + +#include "common.h" + +void vk_malloc_init(struct mpvk_ctx *vk); +void vk_malloc_uninit(struct mpvk_ctx *vk); + +// Represents a single "slice" of generic (non-buffer) memory, plus some +// metadata for accounting. This struct is essentially read-only. +struct vk_memslice { + VkDeviceMemory vkmem; + VkDeviceSize offset; + VkDeviceSize size; + void *priv; +}; + +void vk_free_memslice(struct mpvk_ctx *vk, struct vk_memslice slice); +bool vk_malloc_generic(struct mpvk_ctx *vk, VkMemoryRequirements reqs, + VkMemoryPropertyFlagBits flags, struct vk_memslice *out); + +// Represents a single "slice" of a larger buffer +struct vk_bufslice { + struct vk_memslice mem; // must be freed by the user when done + VkBuffer buf; // the buffer this memory was sliced from + // For persistently mapped buffers, this points to the first usable byte of + // this slice. + void *data; +}; + +// Allocate a buffer slice. This is more efficient than vk_malloc_generic for +// when the user needs lots of buffers, since it doesn't require +// creating/destroying lots of (little) VkBuffers. 
+bool vk_malloc_buffer(struct mpvk_ctx *vk, VkBufferUsageFlagBits bufFlags, + VkMemoryPropertyFlagBits memFlags, VkDeviceSize size, + VkDeviceSize alignment, struct vk_bufslice *out); diff --git a/video/out/vulkan/ra_vk.c b/video/out/vulkan/ra_vk.c new file mode 100644 index 0000000000000..853d868a861d6 --- /dev/null +++ b/video/out/vulkan/ra_vk.c @@ -0,0 +1,1588 @@ +#include "ra_vk.h" +#include "malloc.h" +#include "video/out/opengl/utils.h" + +// For ra.priv +struct ra_vk { + struct mpvk_ctx *vk; + struct ra_tex *clear_tex; // stupid hack for clear() + // "Currently recording" command buffer + struct vk_cmd *active_cmd; +}; + +static struct mpvk_ctx *vk_get(struct ra *ra) +{ + struct ra_vk *p = ra->priv; + return p->vk; +} + +static struct vk_cmd *vk_require_cmd(struct ra *ra) +{ + struct ra_vk *p = ra->priv; + struct mpvk_ctx *vk = vk_get(ra); + struct vk_cmdpool *pool = vk->pool; + + if (p->active_cmd) { + assert(p->active_cmd->pool == pool); + return p->active_cmd; + } + + struct vk_cmd *cmd = vk_cmd_begin(vk, pool); + return p->active_cmd = cmd; +} + +// Note: This technically follows the flush() API, but we don't need +// to expose that (and in fact, it's a bad idea) since we control flushing +// behavior with ra_vk_present_frame already. 
+static void vk_flush(struct ra *ra) +{ + struct ra_vk *p = ra->priv; + struct mpvk_ctx *vk = vk_get(ra); + + if (!p->active_cmd) + return; + + vk_cmd_submit(vk, p->active_cmd, NULL); + p->active_cmd = NULL; +} + +// the callback's *priv will always be set to `ra` +static void vk_callback(struct ra *ra, vk_cb callback, void *arg) +{ + struct ra_vk *p = ra->priv; + struct mpvk_ctx *vk = vk_get(ra); + + if (p->active_cmd) { + vk_cmd_callback(p->active_cmd, callback, ra, arg); + } else { + vk_dev_callback(vk, callback, ra, arg); + } +} + +#define MAKE_LAZY_DESTRUCTOR(fun, argtype) \ + static void fun##_lazy(struct ra *ra, argtype *arg) { \ + vk_callback(ra, (vk_cb) fun, arg); \ + } + +static void vk_destroy_ra(struct ra *ra) +{ + struct ra_vk *p = ra->priv; + struct mpvk_ctx *vk = vk_get(ra); + + vk_flush(ra); + mpvk_wait_idle(vk); + ra_tex_free(ra, &p->clear_tex); + + talloc_free(ra); +} + +static bool vk_setup_formats(struct ra *ra) +{ + struct mpvk_ctx *vk = vk_get(ra); + + for (const struct vk_format *vk_fmt = vk_formats; vk_fmt->name; vk_fmt++) { + VkFormatProperties prop; + vkGetPhysicalDeviceFormatProperties(vk->physd, vk_fmt->iformat, &prop); + + // As a bare minimum, we need to sample from an allocated image + VkFormatFeatureFlags flags = prop.optimalTilingFeatures; + if (!(flags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT)) + continue; + + VkFormatFeatureFlags linear_bits, render_bits; + linear_bits = VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT; + render_bits = VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT | + VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT; + + struct ra_format *fmt = talloc_zero(ra, struct ra_format); + *fmt = (struct ra_format) { + .name = vk_fmt->name, + .priv = (void *)vk_fmt, + .ctype = vk_fmt->ctype, + .ordered = !vk_fmt->fucked_order, + .num_components = vk_fmt->components, + .pixel_size = vk_fmt->bytes, + .linear_filter = !!(flags & linear_bits), + .renderable = !!(flags & render_bits), + }; + + for (int i = 0; i < 4; i++) + 
        // Per-component sizes/depths come straight from the vulkan format table
        fmt->component_size[i] = fmt->component_depth[i] = vk_fmt->bits[i];

        MP_TARRAY_APPEND(ra, ra->formats, ra->num_formats, fmt);
    }

    // Populate some other capabilities related to formats while we're at it
    VkImageType imgType[3] = {
        VK_IMAGE_TYPE_1D,
        VK_IMAGE_TYPE_2D,
        VK_IMAGE_TYPE_3D
    };

    // R8_UNORM is supported on literally every single vulkan implementation
    const VkFormat testfmt = VK_FORMAT_R8_UNORM;

    for (int d = 0; d < 3; d++) {
        VkImageFormatProperties iprop;
        VkResult res = vkGetPhysicalDeviceImageFormatProperties(vk->physd,
                testfmt, imgType[d], VK_IMAGE_TILING_OPTIMAL,
                VK_IMAGE_USAGE_SAMPLED_BIT, 0, &iprop);

        switch (imgType[d]) {
        case VK_IMAGE_TYPE_1D:
            if (res == VK_SUCCESS)
                ra->caps |= RA_CAP_TEX_1D;
            break;
        case VK_IMAGE_TYPE_2D:
            // 2D formats must be supported by RA, so ensure this is the case
            VK_ASSERT(res, "Querying 2D format limits");
            ra->max_texture_wh = MPMIN(iprop.maxExtent.width, iprop.maxExtent.height);
            break;
        case VK_IMAGE_TYPE_3D:
            if (res == VK_SUCCESS)
                ra->caps |= RA_CAP_TEX_3D;
            break;
        }
    }

    // RA_CAP_BLIT implies both blitting between images as well as blitting
    // directly to the swapchain image, so check for all three operations
    bool blittable = true;
    VkFormatProperties prop;
    vkGetPhysicalDeviceFormatProperties(vk->physd, testfmt, &prop);
    if (!(prop.optimalTilingFeatures & VK_FORMAT_FEATURE_BLIT_SRC_BIT))
        blittable = false;
    if (!(prop.optimalTilingFeatures & VK_FORMAT_FEATURE_BLIT_DST_BIT))
        blittable = false;

    // The swapchain surface format must also be a valid blit destination
    vkGetPhysicalDeviceFormatProperties(vk->physd, vk->surf_format.format, &prop);
    if (!(prop.optimalTilingFeatures & VK_FORMAT_FEATURE_BLIT_DST_BIT))
        blittable = false;

    if (blittable)
        ra->caps |= RA_CAP_BLIT;

    return true;

error:
    return false;
}

static struct ra_fns ra_fns_vk;

// Create a ra instance backed by the given (already initialized) vulkan
// context. Returns NULL on failure. Requires vk->dev and vk->alloc.
struct ra *ra_create_vk(struct mpvk_ctx *vk, struct mp_log *log)
{
    assert(vk->dev);
    assert(vk->alloc);

    struct ra *ra = talloc_zero(NULL, struct ra);
    ra->log = log;
    ra->fns = &ra_fns_vk;

    struct ra_vk *p = ra->priv = talloc_zero(ra, struct ra_vk);
    p->vk = vk;

    // There's no way to query the supported GLSL version from VK_NV_glsl_shader
    // (thanks nvidia), so just pick the GL version that modern nvidia devices
    // support..
    ra->glsl_version = 450;
    ra->glsl_vulkan = true;
    ra->max_shmem = vk->limits.maxComputeSharedMemorySize;
    ra->caps = RA_CAP_NESTED_ARRAY;

    if (vk->pool->props.queueFlags & VK_QUEUE_COMPUTE_BIT)
        ra->caps |= RA_CAP_COMPUTE;

    if (!vk_setup_formats(ra))
        goto error;

    // UBO support is required
    ra->caps |= RA_CAP_BUF_RO;

    // Try creating a shader storage buffer; SSBO support is optional, so
    // probe for it by attempting a small dummy allocation
    struct ra_buf_params ssbo_params = {
        .type = RA_BUF_TYPE_SHADER_STORAGE,
        .size = 16,
    };

    struct ra_buf *ssbo = ra_buf_create(ra, &ssbo_params);
    if (ssbo) {
        ra->caps |= RA_CAP_BUF_RW;
        ra_buf_free(ra, &ssbo);
    }

    // To support clear() by region, we need to allocate a dummy 1x1 image that
    // will be used as the source of blit operations
    struct ra_tex_params clear_params = {
        .dimensions = 1, // no point in using a 2D image if height = 1
        .w = 1,
        .h = 1,
        .d = 1,
        .format = ra_find_float16_format(ra, 4),
        .blit_src = 1,
        .host_mutable = 1,
    };

    p->clear_tex = ra_tex_create(ra, &clear_params);
    if (!p->clear_tex) {
        MP_ERR(ra, "Failed creating 1x1 dummy texture for clear()!\n");
        goto error;
    }

    return ra;

error:
    vk_destroy_ra(ra);
    return NULL;
}

// Boilerplate wrapper around vkCreateRenderPass to ensure passes remain
// compatible
static VkResult vk_create_render_pass(VkDevice dev, const struct ra_format *fmt,
                                      bool load_fbo, VkRenderPass *out)
{
    struct vk_format *vk_fmt = fmt->priv;
    assert(fmt->renderable);

    VkRenderPassCreateInfo rinfo = {
        .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO,
        .attachmentCount = 1,
        .pAttachments = &(VkAttachmentDescription) {
            .format = vk_fmt->iformat,
            .samples = VK_SAMPLE_COUNT_1_BIT,
            // LOAD preserves the existing FBO contents (needed e.g. when
            // blending into it), DONT_CARE lets the driver discard them
            .loadOp = load_fbo ? VK_ATTACHMENT_LOAD_OP_LOAD
                               : VK_ATTACHMENT_LOAD_OP_DONT_CARE,
            .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
            .initialLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
            .finalLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
        },
        .subpassCount = 1,
        .pSubpasses = &(VkSubpassDescription) {
            .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
            .colorAttachmentCount = 1,
            .pColorAttachments = &(VkAttachmentReference) {
                .attachment = 0,
                .layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
            },
        },
    };

    return vkCreateRenderPass(dev, &rinfo, MPVK_ALLOCATOR, out);
}

// For ra_tex.priv
struct ra_tex_vk {
    bool external_img; // VkImage is owned externally (e.g. by the swapchain)
    VkImageType type;
    VkImage img;
    struct vk_memslice mem;
    // for sampling
    VkImageView view;
    VkSampler sampler;
    // for rendering
    VkFramebuffer framebuffer;
    VkRenderPass dummyPass;
    // for uploading
    struct ra_buf_pool pbo;
    // "current" metadata, can change during the course of execution
    VkImageLayout current_layout;
    VkPipelineStageFlagBits current_stage;
    VkAccessFlagBits current_access;
};

// Small helper to ease image barrier creation.
if `discard` is set, the contents +// of the image will be undefined after the barrier +static void tex_barrier(struct vk_cmd *cmd, struct ra_tex_vk *tex_vk, + VkPipelineStageFlagBits newStage, + VkAccessFlagBits newAccess, VkImageLayout newLayout, + bool discard) +{ + VkImageMemoryBarrier imgBarrier = { + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .oldLayout = tex_vk->current_layout, + .newLayout = newLayout, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .srcAccessMask = tex_vk->current_access, + .dstAccessMask = newAccess, + .image = tex_vk->img, + .subresourceRange = vk_range, + }; + + if (discard) { + imgBarrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; + imgBarrier.srcAccessMask = 0; + } + + vkCmdPipelineBarrier(cmd->buf, tex_vk->current_stage, newStage, 0, + 0, NULL, 0, NULL, 1, &imgBarrier); + + tex_vk->current_stage = newStage; + tex_vk->current_layout = newLayout; + tex_vk->current_access = newAccess; +} + +static void vk_tex_destroy(struct ra *ra, struct ra_tex *tex) +{ + if (!tex) + return; + + struct mpvk_ctx *vk = vk_get(ra); + struct ra_tex_vk *tex_vk = tex->priv; + + ra_buf_pool_uninit(ra, &tex_vk->pbo); + vkDestroyFramebuffer(vk->dev, tex_vk->framebuffer, MPVK_ALLOCATOR); + vkDestroyRenderPass(vk->dev, tex_vk->dummyPass, MPVK_ALLOCATOR); + vkDestroySampler(vk->dev, tex_vk->sampler, MPVK_ALLOCATOR); + vkDestroyImageView(vk->dev, tex_vk->view, MPVK_ALLOCATOR); + if (!tex_vk->external_img) { + vkDestroyImage(vk->dev, tex_vk->img, MPVK_ALLOCATOR); + vk_free_memslice(vk, tex_vk->mem); + } + + talloc_free(tex); +} + +MAKE_LAZY_DESTRUCTOR(vk_tex_destroy, struct ra_tex); + +// Initializes non-VkImage values like the image view, samplers, etc. 
// Initializes the derived per-texture state (image view, sampler,
// framebuffer) for an already-created tex_vk->img, based on tex->params.
// Returns false on any vulkan error.
static bool vk_init_image(struct ra *ra, struct ra_tex *tex)
{
    struct mpvk_ctx *vk = vk_get(ra);

    struct ra_tex_params *params = &tex->params;
    struct ra_tex_vk *tex_vk = tex->priv;
    assert(tex_vk->img);

    // Fresh images start out in the UNDEFINED layout with no pending accesses
    tex_vk->current_layout = VK_IMAGE_LAYOUT_UNDEFINED;
    tex_vk->current_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
    tex_vk->current_access = 0;

    if (params->render_src || params->render_dst) {
        static const VkImageViewType viewType[] = {
            [VK_IMAGE_TYPE_1D] = VK_IMAGE_VIEW_TYPE_1D,
            [VK_IMAGE_TYPE_2D] = VK_IMAGE_VIEW_TYPE_2D,
            [VK_IMAGE_TYPE_3D] = VK_IMAGE_VIEW_TYPE_3D,
        };

        const struct vk_format *fmt = params->format->priv;
        VkImageViewCreateInfo vinfo = {
            .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
            .image = tex_vk->img,
            .viewType = viewType[tex_vk->type],
            .format = fmt->iformat,
            .subresourceRange = vk_range,
        };

        VK(vkCreateImageView(vk->dev, &vinfo, MPVK_ALLOCATOR, &tex_vk->view));
    }

    if (params->render_src) {
        // Requesting linear filtering on a format without linear_filter
        // support would be invalid usage
        assert(params->format->linear_filter || !params->src_linear);
        VkFilter filter = params->src_linear
            ? VK_FILTER_LINEAR
            : VK_FILTER_NEAREST;
        VkSamplerAddressMode wrap = params->src_repeat
            ? VK_SAMPLER_ADDRESS_MODE_REPEAT
            : VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
        VkSamplerCreateInfo sinfo = {
            .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO,
            .magFilter = filter,
            .minFilter = filter,
            .addressModeU = wrap,
            .addressModeV = wrap,
            .addressModeW = wrap,
            .maxAnisotropy = 1.0,
        };

        VK(vkCreateSampler(vk->dev, &sinfo, MPVK_ALLOCATOR, &tex_vk->sampler));
    }

    if (params->render_dst) {
        // Framebuffers need to be created against a specific render pass
        // layout, so we need to temporarily create a skeleton/dummy render
        // pass for vulkan to figure out the compatibility
        VK(vk_create_render_pass(vk->dev, params->format, false, &tex_vk->dummyPass));

        VkFramebufferCreateInfo finfo = {
            .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
            .renderPass = tex_vk->dummyPass,
            .attachmentCount = 1,
            .pAttachments = &tex_vk->view,
            .width = tex->params.w,
            .height = tex->params.h,
            .layers = 1,
        };

        VK(vkCreateFramebuffer(vk->dev, &finfo, MPVK_ALLOCATOR,
                               &tex_vk->framebuffer));

        // NOTE: Normally we would free the dummyPass again here, but a bug
        // in the nvidia vulkan driver causes a segfault if you do.
+ } + + return true; + +error: + return false; +} + +static struct ra_tex *vk_tex_create(struct ra *ra, + const struct ra_tex_params *params) +{ + struct mpvk_ctx *vk = vk_get(ra); + + struct ra_tex *tex = talloc_zero(NULL, struct ra_tex); + tex->params = *params; + tex->params.initial_data = NULL; + + struct ra_tex_vk *tex_vk = tex->priv = talloc_zero(tex, struct ra_tex_vk); + + const struct vk_format *fmt = params->format->priv; + switch (params->dimensions) { + case 1: tex_vk->type = VK_IMAGE_TYPE_1D; break; + case 2: tex_vk->type = VK_IMAGE_TYPE_2D; break; + case 3: tex_vk->type = VK_IMAGE_TYPE_3D; break; + default: abort(); + } + + VkImageUsageFlags usage = 0; + if (params->render_src) + usage |= VK_IMAGE_USAGE_SAMPLED_BIT; + if (params->render_dst) + usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; + if (params->storage_dst) + usage |= VK_IMAGE_USAGE_STORAGE_BIT; + if (params->blit_src) + usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT; + if (params->host_mutable || params->blit_dst || params->initial_data) + usage |= VK_IMAGE_USAGE_TRANSFER_DST_BIT; + + // Double-check image usage support and fail immediately if invalid + VkImageFormatProperties iprop; + VkResult res = vkGetPhysicalDeviceImageFormatProperties(vk->physd, + fmt->iformat, tex_vk->type, VK_IMAGE_TILING_OPTIMAL, usage, 0, + &iprop); + if (res == VK_ERROR_FORMAT_NOT_SUPPORTED) { + return NULL; + } else { + VK_ASSERT(res, "Querying image format properties"); + } + + VkFormatProperties prop; + vkGetPhysicalDeviceFormatProperties(vk->physd, fmt->iformat, &prop); + VkFormatFeatureFlags flags = prop.optimalTilingFeatures; + + bool has_blit_src = flags & VK_FORMAT_FEATURE_BLIT_SRC_BIT, + has_src_linear = flags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT; + + if (params->w > iprop.maxExtent.width || + params->h > iprop.maxExtent.height || + params->d > iprop.maxExtent.depth || + (params->blit_src && !has_blit_src) || + (params->src_linear && !has_src_linear)) + { + return NULL; + } + + VkImageCreateInfo 
iinfo = { + .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, + .imageType = tex_vk->type, + .format = fmt->iformat, + .extent = (VkExtent3D) { params->w, params->h, params->d }, + .mipLevels = 1, + .arrayLayers = 1, + .samples = VK_SAMPLE_COUNT_1_BIT, + .tiling = VK_IMAGE_TILING_OPTIMAL, + .usage = usage, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, + }; + + VK(vkCreateImage(vk->dev, &iinfo, MPVK_ALLOCATOR, &tex_vk->img)); + + VkMemoryPropertyFlagBits memFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + VkMemoryRequirements reqs; + vkGetImageMemoryRequirements(vk->dev, tex_vk->img, &reqs); + + struct vk_memslice *mem = &tex_vk->mem; + if (!vk_malloc_generic(vk, reqs, memFlags, mem)) + goto error; + + VK(vkBindImageMemory(vk->dev, tex_vk->img, mem->vkmem, mem->offset)); + + if (!vk_init_image(ra, tex)) + goto error; + + if (params->initial_data) { + struct ra_tex_upload_params ul_params = { + .tex = tex, + .invalidate = true, + .src = params->initial_data, + .stride = params->w * fmt->bytes, + }; + if (!ra->fns->tex_upload(ra, &ul_params)) + goto error; + } + + return tex; + +error: + vk_tex_destroy(ra, tex); + return NULL; +} + +struct ra_tex *ra_vk_wrap_swchain_img(struct ra *ra, VkImage vkimg, + VkSwapchainCreateInfoKHR info) +{ + struct mpvk_ctx *vk = vk_get(ra); + struct ra_tex *tex = NULL; + + const struct ra_format *format = NULL; + for (int i = 0; i < ra->num_formats; i++) { + const struct vk_format *fmt = ra->formats[i]->priv; + if (fmt->iformat == vk->surf_format.format) { + format = ra->formats[i]; + break; + } + } + + if (!format) { + MP_ERR(ra, "Could not find ra_format suitable for wrapped swchain image " + "with surface format %d\n", vk->surf_format.format); + goto error; + } + + tex = talloc_zero(NULL, struct ra_tex); + tex->params = (struct ra_tex_params) { + .format = format, + .dimensions = 2, + .w = info.imageExtent.width, + .h = info.imageExtent.height, + .d = 1, + .blit_src = !!(info.imageUsage & 
VK_IMAGE_USAGE_TRANSFER_SRC_BIT), + .blit_dst = !!(info.imageUsage & VK_IMAGE_USAGE_TRANSFER_DST_BIT), + .render_src = !!(info.imageUsage & VK_IMAGE_USAGE_SAMPLED_BIT), + .render_dst = !!(info.imageUsage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT), + }; + + struct ra_tex_vk *tex_vk = tex->priv = talloc_zero(tex, struct ra_tex_vk); + tex_vk->type = VK_IMAGE_TYPE_2D; + tex_vk->external_img = true; + tex_vk->img = vkimg; + + if (!vk_init_image(ra, tex)) + goto error; + + return tex; + +error: + vk_tex_destroy(ra, tex); + return NULL; +} + +// For ra_buf.priv +struct ra_buf_vk { + struct vk_bufslice slice; + bool inuse; + bool needsflush; + // "current" metadata, can change during course of execution + VkPipelineStageFlagBits current_stage; + VkAccessFlagBits current_access; +}; + +static void buf_free_to_use(void *priv, struct ra_buf_vk *buf_vk) +{ + buf_vk->inuse = false; +} + +static void buf_barrier(struct vk_cmd *cmd, struct ra_buf *buf, + VkPipelineStageFlagBits newStage, + VkAccessFlagBits newAccess, int offset, size_t size) +{ + struct ra_buf_vk *buf_vk = buf->priv; + + VkBufferMemoryBarrier buffBarrier = { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, + .srcAccessMask = buf_vk->current_access, + .dstAccessMask = newAccess, + .buffer = buf_vk->slice.buf, + .offset = offset, + .size = size, + }; + + if (buf_vk->needsflush || buf->params.host_mapped) { + buffBarrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT; + buf_vk->current_stage = VK_PIPELINE_STAGE_HOST_BIT; + buf_vk->needsflush = false; + } + + vkCmdPipelineBarrier(cmd->buf, buf_vk->current_stage, newStage, 0, + 0, NULL, 1, &buffBarrier, 0, NULL); + + buf_vk->current_stage = newStage; + buf_vk->current_access = newAccess; + buf_vk->inuse = true; + + vk_cmd_callback(cmd, (vk_cb) buf_free_to_use, NULL, buf_vk); +} + +static void vk_buf_destroy(struct ra *ra, struct ra_buf *buf) +{ + if (!buf) + return; + + struct mpvk_ctx *vk = vk_get(ra); + struct ra_buf_vk *buf_vk = buf->priv; + + if (buf_vk->slice.buf) + 
        vk_free_memslice(vk, buf_vk->slice.mem);

    talloc_free(buf);
}

MAKE_LAZY_DESTRUCTOR(vk_buf_destroy, struct ra_buf);

// Update a buffer's contents at the given byte offset. Host-visible buffers
// are written directly via memcpy; otherwise the update is recorded into the
// current command buffer (which requires 4-byte offset alignment, see ra.h).
static void vk_buf_update(struct ra *ra, struct ra_buf *buf, ptrdiff_t offset,
                          const void *data, size_t size)
{
    assert(buf->params.host_mutable || buf->params.initial_data);
    struct ra_buf_vk *buf_vk = buf->priv;

    // For host-mapped buffers, we can just directly memcpy the buffer contents.
    // Otherwise, we can update the buffer from the GPU using a command buffer.
    if (buf_vk->slice.data) {
        assert(offset + size <= buf->params.size);
        uintptr_t addr = (uintptr_t)buf_vk->slice.data + offset;
        memcpy((void *)addr, data, size);
        // The write may need flushing before GPU use (see buf_barrier)
        buf_vk->needsflush = true;
    } else {
        struct vk_cmd *cmd = vk_require_cmd(ra);
        if (!cmd) {
            MP_ERR(ra, "Failed updating buffer!\n");
            return;
        }

        // vkCmdUpdateBuffer requires dstOffset to be a multiple of 4
        VkDeviceSize bufOffset = buf_vk->slice.mem.offset + offset;
        assert(bufOffset == MP_ALIGN_UP(bufOffset, 4));
        vkCmdUpdateBuffer(cmd->buf, buf_vk->slice.buf, bufOffset, size, data);
    }
}

// Create a ra_buf of the given type/size. The slice alignment is chosen to
// satisfy both buf_update (4) and the device's per-type offset alignment.
static struct ra_buf *vk_buf_create(struct ra *ra,
                                    const struct ra_buf_params *params)
{
    struct mpvk_ctx *vk = vk_get(ra);

    struct ra_buf *buf = talloc_zero(NULL, struct ra_buf);
    buf->params = *params;

    struct ra_buf_vk *buf_vk = buf->priv = talloc_zero(buf, struct ra_buf_vk);
    buf_vk->current_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
    buf_vk->current_access = 0;

    VkBufferUsageFlagBits bufFlags = 0;
    VkMemoryPropertyFlagBits memFlags = 0;
    VkDeviceSize align = 4; // alignment 4 is needed for buf_update

    switch (params->type) {
    case RA_BUF_TYPE_TEX_UPLOAD:
        bufFlags |= VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
        memFlags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
        break;
    case RA_BUF_TYPE_UNIFORM:
        bufFlags |= VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
        memFlags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
        align = MP_ALIGN_UP(align, vk->limits.minUniformBufferOffsetAlignment);
        break;
    case RA_BUF_TYPE_SHADER_STORAGE:
        bufFlags |= VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
        memFlags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
        align = MP_ALIGN_UP(align, vk->limits.minStorageBufferOffsetAlignment);
        break;
    case RA_BUF_TYPE_VERTEX:
        bufFlags |= VK_BUFFER_USAGE_VERTEX_BUFFER_BIT;
        memFlags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
        break;
    default: abort();
    }

    if (params->host_mutable || params->initial_data) {
        // The buffer may be written via transfer commands (vkCmdUpdateBuffer)
        bufFlags |= VK_BUFFER_USAGE_TRANSFER_DST_BIT;
        align = MP_ALIGN_UP(align, vk->limits.optimalBufferCopyOffsetAlignment);
    }

    if (params->host_mapped) {
        memFlags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
                    VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
                    VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
    }

    if (!vk_malloc_buffer(vk, bufFlags, memFlags, params->size, align,
                          &buf_vk->slice))
    {
        goto error;
    }

    if (params->host_mapped)
        buf->data = buf_vk->slice.data;

    if (params->initial_data)
        vk_buf_update(ra, buf, 0, params->initial_data, params->size);

    buf->params.initial_data = NULL; // do this after vk_buf_update
    return buf;

error:
    vk_buf_destroy(ra, buf);
    return NULL;
}

// A buffer is free for reuse iff no submitted command buffer references it
static bool vk_buf_poll(struct ra *ra, struct ra_buf *buf)
{
    struct ra_buf_vk *buf_vk = buf->priv;
    return !buf_vk->inuse;
}

// Upload pixel data to a texture, either from a caller-provided ra_buf or
// (when params->buf is NULL) via an internal PBO round-trip.
static bool vk_tex_upload(struct ra *ra,
                          const struct ra_tex_upload_params *params)
{

    struct ra_tex *tex = params->tex;
    struct ra_tex_vk *tex_vk = tex->priv;

    if (!params->buf)
        return ra_tex_upload_pbo(ra, &tex_vk->pbo, params);

    assert(!params->src);
    assert(params->buf);
    struct ra_buf *buf = params->buf;
    struct ra_buf_vk *buf_vk = buf->priv;

    VkBufferImageCopy region = {
        .bufferOffset = buf_vk->slice.mem.offset + params->buf_offset,
        .bufferRowLength = tex->params.w,
        .bufferImageHeight = tex->params.h,
        .imageSubresource = vk_layers,
        .imageExtent = (VkExtent3D){tex->params.w, tex->params.h, tex->params.d},
    };

    if (tex->params.dimensions == 2) {
        // The caller's stride is in bytes, but vulkan wants the row length
        // measured in texels
        int pix_size = tex->params.format->pixel_size;
        region.bufferRowLength =
params->stride / pix_size; + if (region.bufferRowLength * pix_size != params->stride) { + MP_ERR(ra, "Texture upload strides must be a multiple of the texel " + "size!\n"); + goto error; + } + + if (params->rc) { + struct mp_rect *rc = params->rc; + region.imageOffset = (VkOffset3D){rc->x0, rc->y0, 0}; + region.imageExtent = (VkExtent3D){mp_rect_w(*rc), mp_rect_h(*rc), 1}; + } + } + + uint64_t size = region.bufferRowLength * region.bufferImageHeight * + region.imageExtent.depth; + + struct vk_cmd *cmd = vk_require_cmd(ra); + if (!cmd) + goto error; + + buf_barrier(cmd, buf, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_READ_BIT, region.bufferOffset, size); + + tex_barrier(cmd, tex_vk, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + params->invalidate); + + vkCmdCopyBufferToImage(cmd->buf, buf_vk->slice.buf, tex_vk->img, + tex_vk->current_layout, 1, ®ion); + + return true; + +error: + return false; +} + +#define MPVK_NUM_DS MPVK_MAX_STREAMING_DEPTH + +// For ra_renderpass.priv +struct ra_renderpass_vk { + // Compiled shaders + VkShaderModule vert; + VkShaderModule frag; + VkShaderModule comp; + // Pipeline / render pass + VkPipeline pipe; + VkPipelineLayout pipeLayout; + VkPipelineCache pipeCache; + VkRenderPass renderPass; + // Descriptor set (bindings) + VkDescriptorSetLayout dsLayout; + VkDescriptorPool dsPool; + VkDescriptorSet dss[MPVK_NUM_DS]; + int dindex; + // Vertex buffers (vertices) + struct ra_buf_pool vbo; + + // For updating + VkWriteDescriptorSet *dswrite; + VkDescriptorImageInfo *dsiinfo; + VkDescriptorBufferInfo *dsbinfo; +}; + +static void vk_renderpass_destroy(struct ra *ra, struct ra_renderpass *pass) +{ + if (!pass) + return; + + struct mpvk_ctx *vk = vk_get(ra); + struct ra_renderpass_vk *pass_vk = pass->priv; + + ra_buf_pool_uninit(ra, &pass_vk->vbo); + vkDestroyPipeline(vk->dev, pass_vk->pipe, MPVK_ALLOCATOR); + vkDestroyPipelineCache(vk->dev, pass_vk->pipeCache, 
MPVK_ALLOCATOR); + vkDestroyRenderPass(vk->dev, pass_vk->renderPass, MPVK_ALLOCATOR); + vkDestroyPipelineLayout(vk->dev, pass_vk->pipeLayout, MPVK_ALLOCATOR); + vkDestroyDescriptorPool(vk->dev, pass_vk->dsPool, MPVK_ALLOCATOR); + vkDestroyDescriptorSetLayout(vk->dev, pass_vk->dsLayout, MPVK_ALLOCATOR); + vkDestroyShaderModule(vk->dev, pass_vk->vert, MPVK_ALLOCATOR); + vkDestroyShaderModule(vk->dev, pass_vk->frag, MPVK_ALLOCATOR); + vkDestroyShaderModule(vk->dev, pass_vk->comp, MPVK_ALLOCATOR); + + talloc_free(pass); +} + +MAKE_LAZY_DESTRUCTOR(vk_renderpass_destroy, struct ra_renderpass); + +static const VkDescriptorType dsType[] = { + [RA_VARTYPE_TEX] = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + [RA_VARTYPE_IMG_W] = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + [RA_VARTYPE_BUF_RO] = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + [RA_VARTYPE_BUF_RW] = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, +}; + +static bool vk_get_input_format(struct ra *ra, struct ra_renderpass_input *inp, + VkFormat *out_fmt) +{ + struct mpvk_ctx *vk = vk_get(ra); + + enum ra_ctype ctype; + switch (inp->type) { + case RA_VARTYPE_FLOAT: ctype = RA_CTYPE_FLOAT; break; + case RA_VARTYPE_BYTE_UNORM: ctype = RA_CTYPE_UNORM; break; + default: abort(); + } + + assert(inp->dim_m == 1); + for (const struct vk_format *fmt = vk_formats; fmt->name; fmt++) { + if (fmt->ctype != ctype) + continue; + if (fmt->components != inp->dim_v) + continue; + if (fmt->bytes != ra_renderpass_input_layout(inp).size) + continue; + + // Ensure this format is valid for vertex attributes + VkFormatProperties prop; + vkGetPhysicalDeviceFormatProperties(vk->physd, fmt->iformat, &prop); + if (!(prop.bufferFeatures & VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT)) + continue; + + *out_fmt = fmt->iformat; + return true; + } + + return false; +} + +static const VkPipelineStageFlagBits stageFlags[] = { + [RA_RENDERPASS_TYPE_RASTER] = VK_SHADER_STAGE_FRAGMENT_BIT, + [RA_RENDERPASS_TYPE_COMPUTE] = VK_SHADER_STAGE_COMPUTE_BIT, +}; + +static struct ra_renderpass 
*vk_renderpass_create(struct ra *ra, + const struct ra_renderpass_params *params) +{ + struct mpvk_ctx *vk = vk_get(ra); + + struct ra_renderpass *pass = talloc_zero(NULL, struct ra_renderpass); + pass->params = *ra_renderpass_params_copy(pass, params); + pass->params.cached_program = (bstr){0}; + struct ra_renderpass_vk *pass_vk = pass->priv = + talloc_zero(pass, struct ra_renderpass_vk); + + static int dsCount[RA_VARTYPE_COUNT] = {0}; + VkDescriptorSetLayoutBinding *bindings = NULL; + int num_bindings = 0; + + for (int i = 0; i < params->num_inputs; i++) { + struct ra_renderpass_input *inp = ¶ms->inputs[i]; + switch (inp->type) { + case RA_VARTYPE_TEX: + case RA_VARTYPE_IMG_W: + case RA_VARTYPE_BUF_RO: + case RA_VARTYPE_BUF_RW: { + VkDescriptorSetLayoutBinding desc = { + .binding = inp->binding, + .descriptorType = dsType[inp->type], + .descriptorCount = 1, + .stageFlags = stageFlags[params->type], + }; + + MP_TARRAY_APPEND(pass, bindings, num_bindings, desc); + dsCount[inp->type]++; + break; + } + default: abort(); + } + } + + VkDescriptorPoolSize *dsPoolSizes = NULL; + int poolSizeCount = 0; + for (enum ra_vartype t = 0; t < RA_VARTYPE_COUNT; t++) { + if (dsCount[t] > 0) { + VkDescriptorPoolSize dssize = { + .type = dsType[t], + .descriptorCount = dsCount[t] * MPVK_NUM_DS, + }; + + MP_TARRAY_APPEND(pass, dsPoolSizes, poolSizeCount, dssize); + } + } + + VkDescriptorPoolCreateInfo pinfo = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, + .maxSets = MPVK_NUM_DS, + .pPoolSizes = dsPoolSizes, + .poolSizeCount = poolSizeCount, + }; + + VK(vkCreateDescriptorPool(vk->dev, &pinfo, MPVK_ALLOCATOR, &pass_vk->dsPool)); + talloc_free(dsPoolSizes); + + pass_vk->dswrite = talloc_array(pass, VkWriteDescriptorSet, num_bindings); + pass_vk->dsiinfo = talloc_array(pass, VkDescriptorImageInfo, num_bindings); + pass_vk->dsbinfo = talloc_array(pass, VkDescriptorBufferInfo, num_bindings); + + VkDescriptorSetLayoutCreateInfo dinfo = { + .sType = 
VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .pBindings = bindings, + .bindingCount = num_bindings, + }; + + VK(vkCreateDescriptorSetLayout(vk->dev, &dinfo, MPVK_ALLOCATOR, + &pass_vk->dsLayout)); + + VkDescriptorSetAllocateInfo ainfo = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, + .descriptorPool = pass_vk->dsPool, + .descriptorSetCount = 1, + .pSetLayouts = &pass_vk->dsLayout, + }; + + for (int i = 0; i < MPVK_NUM_DS; i++) + VK(vkAllocateDescriptorSets(vk->dev, &ainfo, &pass_vk->dss[i])); + + VkPipelineLayoutCreateInfo linfo = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .setLayoutCount = 1, + .pSetLayouts = &pass_vk->dsLayout, + }; + + VK(vkCreatePipelineLayout(vk->dev, &linfo, MPVK_ALLOCATOR, + &pass_vk->pipeLayout)); + + VkPipelineCacheCreateInfo pcinfo = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO, + .pInitialData = params->cached_program.start, + .initialDataSize = params->cached_program.len, + }; + + VK(vkCreatePipelineCache(vk->dev, &pcinfo, MPVK_ALLOCATOR, &pass_vk->pipeCache)); + + VkShaderModuleCreateInfo sinfo = { + .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, + }; + + switch (params->type) { + case RA_RENDERPASS_TYPE_RASTER: { + sinfo.pCode = (uint32_t *)params->vertex_shader; + sinfo.codeSize = strlen(params->vertex_shader); + VK(vkCreateShaderModule(vk->dev, &sinfo, MPVK_ALLOCATOR, &pass_vk->vert)); + + sinfo.pCode = (uint32_t *)params->frag_shader; + sinfo.codeSize = strlen(params->frag_shader); + VK(vkCreateShaderModule(vk->dev, &sinfo, MPVK_ALLOCATOR, &pass_vk->frag)); + + VK(vk_create_render_pass(vk->dev, params->target_format, + params->enable_blend, &pass_vk->renderPass)); + + VkPipelineShaderStageCreateInfo stages[] = { + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_VERTEX_BIT, + .module = pass_vk->vert, + .pName = "main", + }, + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = 
VK_SHADER_STAGE_FRAGMENT_BIT, + .module = pass_vk->frag, + .pName = "main", + } + }; + + VkVertexInputAttributeDescription *attrs = talloc_array(pass, + VkVertexInputAttributeDescription, params->num_vertex_attribs); + + for (int i = 0; i < params->num_vertex_attribs; i++) { + struct ra_renderpass_input *inp = ¶ms->vertex_attribs[i]; + attrs[i] = (VkVertexInputAttributeDescription) { + .location = i, + .binding = 0, + .offset = inp->offset, + }; + + if (!vk_get_input_format(ra, inp, &attrs[i].format)) { + MP_ERR(ra, "No suitable VkFormat for vertex attrib '%s'!\n", + inp->name); + goto error; + } + } + + static const VkBlendFactor blendFactors[] = { + [RA_BLEND_ZERO] = VK_BLEND_FACTOR_ZERO, + [RA_BLEND_ONE] = VK_BLEND_FACTOR_ONE, + [RA_BLEND_SRC_ALPHA] = VK_BLEND_FACTOR_SRC_ALPHA, + [RA_BLEND_ONE_MINUS_SRC_ALPHA] = VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA, + }; + + VkPipelineColorBlendAttachmentState binfo = { + .blendEnable = params->enable_blend, + .colorBlendOp = VK_BLEND_OP_ADD, + .srcColorBlendFactor = blendFactors[params->blend_src_rgb], + .dstColorBlendFactor = blendFactors[params->blend_dst_rgb], + .alphaBlendOp = VK_BLEND_OP_ADD, + .srcAlphaBlendFactor = blendFactors[params->blend_src_alpha], + .dstAlphaBlendFactor = blendFactors[params->blend_dst_alpha], + .colorWriteMask = VK_COLOR_COMPONENT_R_BIT | + VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT | + VK_COLOR_COMPONENT_A_BIT, + }; + + VkGraphicsPipelineCreateInfo cinfo = { + .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, + .stageCount = MP_ARRAY_SIZE(stages), + .pStages = &stages[0], + .pVertexInputState = &(VkPipelineVertexInputStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, + .vertexBindingDescriptionCount = 1, + .pVertexBindingDescriptions = &(VkVertexInputBindingDescription) { + .binding = 0, + .stride = params->vertex_stride, + .inputRate = VK_VERTEX_INPUT_RATE_VERTEX, + }, + .vertexAttributeDescriptionCount = params->num_vertex_attribs, + 
.pVertexAttributeDescriptions = attrs, + }, + .pInputAssemblyState = &(VkPipelineInputAssemblyStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, + .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST, + }, + .pViewportState = &(VkPipelineViewportStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO, + .viewportCount = 1, + .scissorCount = 1, + }, + .pRasterizationState = &(VkPipelineRasterizationStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO, + .polygonMode = VK_POLYGON_MODE_FILL, + .cullMode = VK_CULL_MODE_NONE, + .lineWidth = 1.0f, + }, + .pMultisampleState = &(VkPipelineMultisampleStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, + .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT, + }, + .pColorBlendState = &(VkPipelineColorBlendStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = &binfo, + }, + .pDynamicState = &(VkPipelineDynamicStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO, + .dynamicStateCount = 2, + .pDynamicStates = (VkDynamicState[]){ + VK_DYNAMIC_STATE_VIEWPORT, + VK_DYNAMIC_STATE_SCISSOR, + }, + }, + .layout = pass_vk->pipeLayout, + .renderPass = pass_vk->renderPass, + }; + + VK(vkCreateGraphicsPipelines(vk->dev, pass_vk->pipeCache, 1, &cinfo, + MPVK_ALLOCATOR, &pass_vk->pipe)); + break; + } + case RA_RENDERPASS_TYPE_COMPUTE: { + sinfo.pCode = (uint32_t *)params->compute_shader; + sinfo.codeSize = strlen(params->compute_shader); + VK(vkCreateShaderModule(vk->dev, &sinfo, MPVK_ALLOCATOR, &pass_vk->comp)); + + VkComputePipelineCreateInfo cinfo = { + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .stage = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .module = pass_vk->comp, + .pName = "main", + }, + .layout = pass_vk->pipeLayout, + 
}; + + VK(vkCreateComputePipelines(vk->dev, pass_vk->pipeCache, 1, &cinfo, + MPVK_ALLOCATOR, &pass_vk->pipe)); + break; + } + } + + // Update cached program + bstr *prog = &pass->params.cached_program; + VK(vkGetPipelineCacheData(vk->dev, pass_vk->pipeCache, &prog->len, NULL)); + prog->start = talloc_size(pass, prog->len); + VK(vkGetPipelineCacheData(vk->dev, pass_vk->pipeCache, &prog->len, prog->start)); + + return pass; + +error: + vk_renderpass_destroy(ra, pass); + return NULL; +} + +static void vk_update_descriptor(struct vk_cmd *cmd, + struct ra_renderpass *pass, + struct ra_renderpass_input_val val, + VkDescriptorSet ds, int idx) +{ + struct ra_renderpass_vk *pass_vk = pass->priv; + struct ra_renderpass_input *inp = &pass->params.inputs[val.index]; + + VkWriteDescriptorSet *wds = &pass_vk->dswrite[idx]; + *wds = (VkWriteDescriptorSet) { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = ds, + .dstBinding = inp->binding, + .descriptorCount = 1, + .descriptorType = dsType[inp->type], + }; + + switch (inp->type) { + case RA_VARTYPE_TEX: { + struct ra_tex *tex = *(struct ra_tex **)val.data; + struct ra_tex_vk *tex_vk = tex->priv; + + assert(tex->params.render_src); + tex_barrier(cmd, tex_vk, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, + VK_ACCESS_SHADER_READ_BIT, + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, false); + + VkDescriptorImageInfo *iinfo = &pass_vk->dsiinfo[idx]; + *iinfo = (VkDescriptorImageInfo) { + .sampler = tex_vk->sampler, + .imageView = tex_vk->view, + .imageLayout = tex_vk->current_layout, + }; + + wds->pImageInfo = iinfo; + break; + } + case RA_VARTYPE_IMG_W: { + struct ra_tex *tex = *(struct ra_tex **)val.data; + struct ra_tex_vk *tex_vk = tex->priv; + + assert(tex->params.storage_dst); + tex_barrier(cmd, tex_vk, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, + VK_ACCESS_SHADER_WRITE_BIT, + VK_IMAGE_LAYOUT_GENERAL, false); + + VkDescriptorImageInfo *iinfo = &pass_vk->dsiinfo[idx]; + *iinfo = (VkDescriptorImageInfo) { + .imageView = tex_vk->view, 
+ .imageLayout = tex_vk->current_layout, + }; + + wds->pImageInfo = iinfo; + break; + } + case RA_VARTYPE_BUF_RO: + case RA_VARTYPE_BUF_RW: { + struct ra_buf *buf = *(struct ra_buf **)val.data; + struct ra_buf_vk *buf_vk = buf->priv; + + VkBufferUsageFlags access = VK_ACCESS_SHADER_READ_BIT; + if (inp->type == RA_VARTYPE_BUF_RW) + access |= VK_ACCESS_SHADER_WRITE_BIT; + + buf_barrier(cmd, buf, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, + access, buf_vk->slice.mem.offset, buf->params.size); + + VkDescriptorBufferInfo *binfo = &pass_vk->dsbinfo[idx]; + *binfo = (VkDescriptorBufferInfo) { + .buffer = buf_vk->slice.buf, + .offset = buf_vk->slice.mem.offset, + .range = buf->params.size, + }; + + wds->pBufferInfo = binfo; + break; + } + } +} + +static void vk_renderpass_run(struct ra *ra, + const struct ra_renderpass_run_params *params) +{ + struct mpvk_ctx *vk = vk_get(ra); + struct ra_renderpass *pass = params->pass; + struct ra_renderpass_vk *pass_vk = pass->priv; + + struct vk_cmd *cmd = vk_require_cmd(ra); + if (!cmd) + goto error; + + static const VkPipelineBindPoint bindPoint[] = { + [RA_RENDERPASS_TYPE_RASTER] = VK_PIPELINE_BIND_POINT_GRAPHICS, + [RA_RENDERPASS_TYPE_COMPUTE] = VK_PIPELINE_BIND_POINT_COMPUTE, + }; + + vkCmdBindPipeline(cmd->buf, bindPoint[pass->params.type], pass_vk->pipe); + + VkDescriptorSet ds = pass_vk->dss[pass_vk->dindex++]; + pass_vk->dindex %= MPVK_NUM_DS; + + for (int i = 0; i < params->num_values; i++) + vk_update_descriptor(cmd, pass, params->values[i], ds, i); + + if (params->num_values > 0) { + vkUpdateDescriptorSets(vk->dev, params->num_values, pass_vk->dswrite, + 0, NULL); + } + + vkCmdBindDescriptorSets(cmd->buf, bindPoint[pass->params.type], + pass_vk->pipeLayout, 0, 1, &ds, 0, NULL); + + switch (pass->params.type) { + case RA_RENDERPASS_TYPE_COMPUTE: + vkCmdDispatch(cmd->buf, params->compute_groups[0], + params->compute_groups[1], + params->compute_groups[2]); + break; + case RA_RENDERPASS_TYPE_RASTER: { + struct ra_tex *tex = 
params->target; + struct ra_tex_vk *tex_vk = tex->priv; + assert(tex->params.render_dst); + + struct ra_buf_params buf_params = { + .type = RA_BUF_TYPE_VERTEX, + .size = params->vertex_count * pass->params.vertex_stride, + .host_mutable = true, + }; + + struct ra_buf *buf = ra_buf_pool_get(ra, &pass_vk->vbo, &buf_params); + if (!buf) { + MP_ERR(ra, "Failed allocating vertex buffer!\n"); + goto error; + } + struct ra_buf_vk *buf_vk = buf->priv; + + vk_buf_update(ra, buf, 0, params->vertex_data, buf_params.size); + + buf_barrier(cmd, buf, VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, + VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT, + buf_vk->slice.mem.offset, buf_params.size); + + vkCmdBindVertexBuffers(cmd->buf, 0, 1, &buf_vk->slice.buf, + &buf_vk->slice.mem.offset); + + tex_barrier(cmd, tex_vk, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, + VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, false); + + VkViewport viewport = { + .x = params->viewport.x0, + .y = params->viewport.y0, + .width = mp_rect_w(params->viewport), + .height = mp_rect_h(params->viewport), + }; + + VkRect2D scissor = { + .offset = {params->scissors.x0, params->scissors.y0}, + .extent = {mp_rect_w(params->scissors), mp_rect_h(params->scissors)}, + }; + + vkCmdSetViewport(cmd->buf, 0, 1, &viewport); + vkCmdSetScissor(cmd->buf, 0, 1, &scissor); + + VkRenderPassBeginInfo binfo = { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, + .renderPass = pass_vk->renderPass, + .framebuffer = tex_vk->framebuffer, + .renderArea = (VkRect2D){{0, 0}, {tex->params.w, tex->params.h}}, + }; + + vkCmdBeginRenderPass(cmd->buf, &binfo, VK_SUBPASS_CONTENTS_INLINE); + vkCmdDraw(cmd->buf, params->vertex_count, 1, 0, 0); + vkCmdEndRenderPass(cmd->buf); + break; + } + default: abort(); + }; + +error: + return; +} + +static void vk_blit(struct ra *ra, struct ra_tex *dst, struct ra_tex *src, + struct mp_rect *dst_rc, struct mp_rect *src_rc) +{ + assert(src->params.blit_src); + assert(dst->params.blit_dst); + 
+ struct ra_tex_vk *src_vk = src->priv; + struct ra_tex_vk *dst_vk = dst->priv; + + struct vk_cmd *cmd = vk_require_cmd(ra); + if (!cmd) + return; + + tex_barrier(cmd, src_vk, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_READ_BIT, + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + false); + + bool discard = dst_rc->x0 == 0 && + dst_rc->y0 == 0 && + dst_rc->x1 == dst->params.w && + dst_rc->y1 == dst->params.h; + + tex_barrier(cmd, dst_vk, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + discard); + + VkImageBlit region = { + .srcSubresource = vk_layers, + .srcOffsets = {{src_rc->x0, src_rc->y0, 0}, {src_rc->x1, src_rc->y1, 1}}, + .dstSubresource = vk_layers, + .dstOffsets = {{dst_rc->x0, dst_rc->y0, 0}, {dst_rc->x1, dst_rc->y1, 1}}, + }; + + vkCmdBlitImage(cmd->buf, src_vk->img, src_vk->current_layout, dst_vk->img, + dst_vk->current_layout, 1, &region, VK_FILTER_NEAREST); +} + +static void vk_clear(struct ra *ra, struct ra_tex *tex, float color[4], + struct mp_rect *rc) +{ + struct ra_vk *p = ra->priv; + struct ra_tex_vk *tex_vk = tex->priv; + assert(tex->params.blit_dst); + + struct vk_cmd *cmd = vk_require_cmd(ra); + if (!cmd) + return; + + struct mp_rect full = {0, 0, tex->params.w, tex->params.h}; + if (!rc || mp_rect_equals(rc, &full)) { + // To clear the entire image, we can use the efficient clear command + tex_barrier(cmd, tex_vk, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, true); + + VkClearColorValue clearColor = {0}; + for (int c = 0; c < 4; c++) + clearColor.float32[c] = color[c]; + + vkCmdClearColorImage(cmd->buf, tex_vk->img, tex_vk->current_layout, + &clearColor, 1, &vk_range); + } else { + // To simulate per-region clearing, we blit from a 1x1 texture instead + struct ra_tex_upload_params ul_params = { + .tex = p->clear_tex, + .invalidate = true, + .src = &color[0], + }; + vk_tex_upload(ra, &ul_params); + vk_blit(ra, tex, p->clear_tex, rc, 
&(struct mp_rect){0, 0, 1, 1}); + } +} + +#define VK_QUERY_POOL_SIZE (MPVK_MAX_STREAMING_DEPTH * 4) + +struct vk_timer { + VkQueryPool pool; + int index; + uint64_t result; +}; + +static void vk_timer_destroy(struct ra *ra, ra_timer *ratimer) +{ + if (!ratimer) + return; + + struct mpvk_ctx *vk = vk_get(ra); + struct vk_timer *timer = ratimer; + + vkDestroyQueryPool(vk->dev, timer->pool, MPVK_ALLOCATOR); + + talloc_free(timer); +} + +MAKE_LAZY_DESTRUCTOR(vk_timer_destroy, ra_timer); + +static ra_timer *vk_timer_create(struct ra *ra) +{ + struct mpvk_ctx *vk = vk_get(ra); + + struct vk_timer *timer = talloc_zero(NULL, struct vk_timer); + + struct VkQueryPoolCreateInfo qinfo = { + .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, + .queryType = VK_QUERY_TYPE_TIMESTAMP, + .queryCount = VK_QUERY_POOL_SIZE, + }; + + VK(vkCreateQueryPool(vk->dev, &qinfo, MPVK_ALLOCATOR, &timer->pool)); + + return (ra_timer *)timer; + +error: + vk_timer_destroy(ra, timer); + return NULL; +} + +static void vk_timer_start(struct ra *ra, ra_timer *ratimer) +{ + struct mpvk_ctx *vk = vk_get(ra); + struct vk_timer *timer = ratimer; + + struct vk_cmd *cmd = vk_require_cmd(ra); + if (!cmd) + return; + + timer->index = (timer->index + 2) % VK_QUERY_POOL_SIZE; + + uint64_t out[2]; + VkResult res = vkGetQueryPoolResults(vk->dev, timer->pool, timer->index, 2, + sizeof(out), &out[0], sizeof(uint64_t), + VK_QUERY_RESULT_64_BIT); + switch (res) { + case VK_SUCCESS: + timer->result = out[1] - out[0]; + break; + case VK_NOT_READY: + timer->result = 0; + break; + default: + MP_WARN(vk, "Failed reading timer query result: %s\n", vk_err(res)); + return; + }; + + vkCmdWriteTimestamp(cmd->buf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + timer->pool, timer->index); +} + +static uint64_t vk_timer_stop(struct ra *ra, ra_timer *ratimer) +{ + struct vk_timer *timer = ratimer; + struct vk_cmd *cmd = vk_require_cmd(ra); + + if (cmd) { + vkCmdWriteTimestamp(cmd->buf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + timer->pool, 
timer->index + 1); + } + + return timer->result; +} + +static struct ra_fns ra_fns_vk = { + .destroy = vk_destroy_ra, + .tex_create = vk_tex_create, + .tex_destroy = vk_tex_destroy_lazy, + .tex_upload = vk_tex_upload, + .buf_create = vk_buf_create, + .buf_destroy = vk_buf_destroy_lazy, + .buf_update = vk_buf_update, + .buf_poll = vk_buf_poll, + .clear = vk_clear, + .blit = vk_blit, + .renderpass_create = vk_renderpass_create, + .renderpass_destroy = vk_renderpass_destroy_lazy, + .renderpass_run = vk_renderpass_run, + .timer_create = vk_timer_create, + .timer_destroy = vk_timer_destroy_lazy, + .timer_start = vk_timer_start, + .timer_stop = vk_timer_stop, +}; + +static void present_cb(struct ra *ra, int *inflight) +{ + *inflight -= 1; +} + +bool ra_vk_present_frame(struct ra *ra, struct vk_swimg *swimg, int *inflight) +{ + struct ra_vk *p = ra->priv; + struct mpvk_ctx *vk = vk_get(ra); + assert(p->active_cmd); + + if (inflight) { + *inflight += 1; + vk_callback(ra, (vk_cb)present_cb, inflight); + } + + struct ra_tex *img = swimg->image; + + tex_barrier(p->active_cmd, img->priv, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + 0, VK_IMAGE_LAYOUT_PRESENT_SRC_KHR, false); + + // These are the only two stages that we use/support for actually + // outputting to swapchain imagechain images, so just add a dependency + // on both of them. In theory, we could maybe come up with some more + // advanced mechanism of tracking dynamic dependencies, but that seems + // like overkill. 
+ vk_cmd_dep(p->active_cmd, swimg->acquired, + VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT | + VK_PIPELINE_STAGE_TRANSFER_BIT); + + VkSemaphore done; + if (!vk_cmd_submit(vk, p->active_cmd, &done)) + goto error; + p->active_cmd = NULL; + + struct vk_cmdpool *pool = vk->pool; + VkQueue queue = pool->queues[pool->qindex]; + pool->qindex %= pool->qcount; + + VkPresentInfoKHR pinfo = { + .sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR, + .waitSemaphoreCount = 1, + .pWaitSemaphores = &done, + .swapchainCount = 1, + .pSwapchains = &swimg->chain->swchain, + .pImageIndices = &swimg->index, + }; + + VK(vkQueuePresentKHR(queue, &pinfo)); + + return true; + +error: + return false; +} diff --git a/video/out/vulkan/ra_vk.h b/video/out/vulkan/ra_vk.h new file mode 100644 index 0000000000000..214a9af6f3552 --- /dev/null +++ b/video/out/vulkan/ra_vk.h @@ -0,0 +1,25 @@ +#pragma once + +#include "common.h" +#include "utils.h" +#include "video/out/opengl/ra.h" + +struct ra *ra_create_vk(struct mpvk_ctx *vk, struct mp_log *log); + +// Access to the VkDevice is needed for swapchain creation +VkDevice ra_vk_get_dev(struct ra *ra); + +// Allocates a ra_tex that wraps a swapchain image. The contents of the image +// will be invalidated, and access to it will only be internally synchronized. +// So the calling could should not do anything else with the VkImage. +struct ra_tex *ra_vk_wrap_swchain_img(struct ra *ra, VkImage vkimg, + VkSwapchainCreateInfoKHR info); + +// This function flushes the command buffers, and enqueues the image for +// presentation. This command must only be used after drawing to the vk_swchain, +// but before the command buffers are flushed for other reasons (for +// synchronization). The frames_in_flight pointer will be used to track how +// many frames are currently in flight. 
(That is, it will be incremented when +// this function is called, and decremented when the command completes) +bool ra_vk_present_frame(struct ra *ra, struct vk_swimg *swimg, + int *frames_in_flight); diff --git a/video/out/vulkan/utils.c b/video/out/vulkan/utils.c new file mode 100644 index 0000000000000..6c14bce2455d4 --- /dev/null +++ b/video/out/vulkan/utils.c @@ -0,0 +1,936 @@ +#include + +#include "utils.h" +#include "malloc.h" +#include "ra_vk.h" +#include "video/out/x11_common.h" + +const char* vk_err(VkResult res) +{ + switch (res) { + // These are technically success codes, but include them nonetheless + case VK_SUCCESS: return "VK_SUCCESS"; + case VK_NOT_READY: return "VK_NOT_READY"; + case VK_TIMEOUT: return "VK_TIMEOUT"; + case VK_EVENT_SET: return "VK_EVENT_SET"; + case VK_EVENT_RESET: return "VK_EVENT_RESET"; + case VK_INCOMPLETE: return "VK_INCOMPLETE"; + + // Actual error codes + case VK_ERROR_OUT_OF_HOST_MEMORY: return "VK_ERROR_OUT_OF_HOST_MEMORY"; + case VK_ERROR_OUT_OF_DEVICE_MEMORY: return "VK_ERROR_OUT_OF_DEVICE_MEMORY"; + case VK_ERROR_INITIALIZATION_FAILED: return "VK_ERROR_INITIALIZATION_FAILED"; + case VK_ERROR_DEVICE_LOST: return "VK_ERROR_DEVICE_LOST"; + case VK_ERROR_MEMORY_MAP_FAILED: return "VK_ERROR_MEMORY_MAP_FAILED"; + case VK_ERROR_LAYER_NOT_PRESENT: return "VK_ERROR_LAYER_NOT_PRESENT"; + case VK_ERROR_EXTENSION_NOT_PRESENT: return "VK_ERROR_EXTENSION_NOT_PRESENT"; + case VK_ERROR_FEATURE_NOT_PRESENT: return "VK_ERROR_FEATURE_NOT_PRESENT"; + case VK_ERROR_INCOMPATIBLE_DRIVER: return "VK_ERROR_INCOMPATIBLE_DRIVER"; + case VK_ERROR_TOO_MANY_OBJECTS: return "VK_ERROR_TOO_MANY_OBJECTS"; + case VK_ERROR_FORMAT_NOT_SUPPORTED: return "VK_ERROR_FORMAT_NOT_SUPPORTED"; + case VK_ERROR_FRAGMENTED_POOL: return "VK_ERROR_FRAGMENTED_POOL"; + } + + return "Unknown error!"; +} + +static const char* vk_dbg_type(VkDebugReportObjectTypeEXT type) +{ + switch (type) { + case VK_DEBUG_REPORT_OBJECT_TYPE_INSTANCE_EXT: + return "VkInstance"; + case 
VK_DEBUG_REPORT_OBJECT_TYPE_PHYSICAL_DEVICE_EXT: + return "VkPhysicalDevice"; + case VK_DEBUG_REPORT_OBJECT_TYPE_DEVICE_EXT: + return "VkDevice"; + case VK_DEBUG_REPORT_OBJECT_TYPE_QUEUE_EXT: + return "VkQueue"; + case VK_DEBUG_REPORT_OBJECT_TYPE_SEMAPHORE_EXT: + return "VkSemaphore"; + case VK_DEBUG_REPORT_OBJECT_TYPE_COMMAND_BUFFER_EXT: + return "VkCommandBuffer"; + case VK_DEBUG_REPORT_OBJECT_TYPE_FENCE_EXT: + return "VkFence"; + case VK_DEBUG_REPORT_OBJECT_TYPE_DEVICE_MEMORY_EXT: + return "VkDeviceMemory"; + case VK_DEBUG_REPORT_OBJECT_TYPE_BUFFER_EXT: + return "VkBuffer"; + case VK_DEBUG_REPORT_OBJECT_TYPE_IMAGE_EXT: + return "VkImage"; + case VK_DEBUG_REPORT_OBJECT_TYPE_EVENT_EXT: + return "VkEvent"; + case VK_DEBUG_REPORT_OBJECT_TYPE_QUERY_POOL_EXT: + return "VkQueryPool"; + case VK_DEBUG_REPORT_OBJECT_TYPE_BUFFER_VIEW_EXT: + return "VkBufferView"; + case VK_DEBUG_REPORT_OBJECT_TYPE_IMAGE_VIEW_EXT: + return "VkImageView"; + case VK_DEBUG_REPORT_OBJECT_TYPE_SHADER_MODULE_EXT: + return "VkShaderModule"; + case VK_DEBUG_REPORT_OBJECT_TYPE_PIPELINE_CACHE_EXT: + return "VkPipelineCache"; + case VK_DEBUG_REPORT_OBJECT_TYPE_PIPELINE_LAYOUT_EXT: + return "VkPipelineLayout"; + case VK_DEBUG_REPORT_OBJECT_TYPE_RENDER_PASS_EXT: + return "VkRenderPass"; + case VK_DEBUG_REPORT_OBJECT_TYPE_PIPELINE_EXT: + return "VkPipeline"; + case VK_DEBUG_REPORT_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT_EXT: + return "VkDescriptorSetLayout"; + case VK_DEBUG_REPORT_OBJECT_TYPE_SAMPLER_EXT: + return "VkSampler"; + case VK_DEBUG_REPORT_OBJECT_TYPE_DESCRIPTOR_POOL_EXT: + return "VkDescriptorPool"; + case VK_DEBUG_REPORT_OBJECT_TYPE_DESCRIPTOR_SET_EXT: + return "VkDescriptorSet"; + case VK_DEBUG_REPORT_OBJECT_TYPE_FRAMEBUFFER_EXT: + return "VkFramebuffer"; + case VK_DEBUG_REPORT_OBJECT_TYPE_COMMAND_POOL_EXT: + return "VkCommandPool"; + case VK_DEBUG_REPORT_OBJECT_TYPE_SURFACE_KHR_EXT: + return "VkSurfaceKHR"; + case VK_DEBUG_REPORT_OBJECT_TYPE_SWAPCHAIN_KHR_EXT: + return "VkSwapchainKHR"; + case 
VK_DEBUG_REPORT_OBJECT_TYPE_DEBUG_REPORT_EXT: + return "VkDebugReportCallbackEXT"; + case VK_DEBUG_REPORT_OBJECT_TYPE_UNKNOWN_EXT: + default: + return "unknown object"; + } +} + +static VkBool32 vk_dbg_callback(VkDebugReportFlagsEXT flags, + VkDebugReportObjectTypeEXT objType, + uint64_t obj, size_t loc, int32_t msgCode, + const char *layer, const char *msg, void *priv) +{ + struct mpvk_ctx *vk = priv; + int lev = MSGL_V; + + switch (flags) { + case VK_DEBUG_REPORT_ERROR_BIT_EXT: lev = MSGL_ERR; break; + case VK_DEBUG_REPORT_WARNING_BIT_EXT: lev = MSGL_WARN; break; + case VK_DEBUG_REPORT_INFORMATION_BIT_EXT: lev = MSGL_TRACE; break; + case VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT: lev = MSGL_WARN; break; + case VK_DEBUG_REPORT_DEBUG_BIT_EXT: lev = MSGL_DEBUG; break; + }; + + MP_MSG(vk, lev, "vk [%s] %d: %s (obj 0x%lx (%s), loc 0x%lx)\n", + layer, msgCode, msg, obj, vk_dbg_type(objType), loc); + + // The return value of this function determines whether the call will + // be explicitly aborted (to prevent GPU errors) or not. In this case, + // we generally want this to be on for the errors. + return (flags & VK_DEBUG_REPORT_ERROR_BIT_EXT); +} + +void mpvk_uninit(struct mpvk_ctx *vk) +{ + if (!vk->inst) + return; + + if (vk->dev) { + struct vk_cmdpool *pool = vk->pool; + // also frees associated command buffers + vkDestroyCommandPool(vk->dev, pool->pool, MPVK_ALLOCATOR); + for (int n = 0; n < MPVK_MAX_CMDS; n++) { + vkDestroyFence(vk->dev, pool->cmds[n].fence, MPVK_ALLOCATOR); + vkDestroySemaphore(vk->dev, pool->cmds[n].done, MPVK_ALLOCATOR); + talloc_free(pool->cmds[n].callbacks); + } + talloc_free(vk->pool); + vk_malloc_uninit(vk); + vkDestroyDevice(vk->dev, MPVK_ALLOCATOR); + } + + if (vk->dbg) { + // Same deal as creating the debug callback, we need to load this + // first. 
+ VK_LOAD_PFN(vkDestroyDebugReportCallbackEXT) + pfn_vkDestroyDebugReportCallbackEXT(vk->inst, vk->dbg, MPVK_ALLOCATOR); + } + + vkDestroySurfaceKHR(vk->inst, vk->surf, MPVK_ALLOCATOR); + vkDestroyInstance(vk->inst, MPVK_ALLOCATOR); + + *vk = (struct mpvk_ctx){0}; +} + +bool mpvk_instance_init(struct mpvk_ctx *vk, bool debug) +{ + VkInstanceCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, + }; + + if (debug) { + // Enables the LunarG standard validation layer, which + // is a meta-layer that loads lots of other validators + static const char* layers[] = { + "VK_LAYER_LUNARG_standard_validation", + }; + + info.ppEnabledLayerNames = layers; + info.enabledLayerCount = MP_ARRAY_SIZE(layers); + } + + // Enable whatever extensions were compiled in. + static const char *extensions[] = { + VK_KHR_SURFACE_EXTENSION_NAME, +#if HAVE_VULKAN_XLIB + VK_KHR_XLIB_SURFACE_EXTENSION_NAME, +#endif + + // Extra extensions only used for debugging. These are toggled by + // decreasing the enabledExtensionCount, so the number needs to be + // synchronized with the code below. 
+ VK_EXT_DEBUG_REPORT_EXTENSION_NAME, + }; + + const int debugExtensionCount = 1; + + info.ppEnabledExtensionNames = extensions; + info.enabledExtensionCount = MP_ARRAY_SIZE(extensions); + + if (!debug) + info.enabledExtensionCount -= debugExtensionCount; + + VkResult res = vkCreateInstance(&info, MPVK_ALLOCATOR, &vk->inst); + if (res != VK_SUCCESS) { + MP_VERBOSE(vk, "failed creating instance: %s\n", vk_err(res)); + return false; + } + + if (debug) { + // Set up a debug callback to catch validation messages + VkDebugReportCallbackCreateInfoEXT dinfo = { + .sType = VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT, + .flags = VK_DEBUG_REPORT_INFORMATION_BIT_EXT | + VK_DEBUG_REPORT_WARNING_BIT_EXT | + VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT | + VK_DEBUG_REPORT_ERROR_BIT_EXT | + VK_DEBUG_REPORT_DEBUG_BIT_EXT, + .pfnCallback = vk_dbg_callback, + .pUserData = vk, + }; + + // Since this is not part of the core spec, we need to load it. This + // can't fail because we've already successfully created an instance + // with this extension enabled. 
+ VK_LOAD_PFN(vkCreateDebugReportCallbackEXT) + pfn_vkCreateDebugReportCallbackEXT(vk->inst, &dinfo, MPVK_ALLOCATOR, + &vk->dbg); + } + + return true; +} + +#define MPVK_MAX_DEVICES 16 + +static bool physd_supports_surface(struct mpvk_ctx *vk, VkPhysicalDevice physd) +{ + uint32_t qfnum; + vkGetPhysicalDeviceQueueFamilyProperties(physd, &qfnum, NULL); + + for (int i = 0; i < qfnum; i++) { + VkBool32 sup; + VK(vkGetPhysicalDeviceSurfaceSupportKHR(physd, i, vk->surf, &sup)); + if (sup) + return true; + } + +error: + return false; +} + +bool mpvk_find_phys_device(struct mpvk_ctx *vk, const char *name, bool sw) +{ + assert(vk->surf); + + MP_VERBOSE(vk, "Probing for vulkan devices..\n"); + + VkPhysicalDevice *devices = NULL; + uint32_t num = 0; + VK(vkEnumeratePhysicalDevices(vk->inst, &num, NULL)); + devices = talloc_array(NULL, VkPhysicalDevice, num); + VK(vkEnumeratePhysicalDevices(vk->inst, &num, devices)); + + // Sorted by "priority". Reuses some m_opt code for convenience + static const struct m_opt_choice_alternatives types[] = { + {"discrete", VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU}, + {"integrated", VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU}, + {"virtual", VK_PHYSICAL_DEVICE_TYPE_VIRTUAL_GPU}, + {"software", VK_PHYSICAL_DEVICE_TYPE_CPU}, + {"unknown", VK_PHYSICAL_DEVICE_TYPE_OTHER}, + {0} + }; + + VkPhysicalDeviceProperties props[MPVK_MAX_DEVICES]; + for (int i = 0; i < num; i++) { + vkGetPhysicalDeviceProperties(devices[i], &props[i]); + MP_VERBOSE(vk, "GPU %d: %s (%s)\n", i, props[i].deviceName, + m_opt_choice_str(types, props[i].deviceType)); + } + + // Iterate through each type in order of decreasing preference + for (int t = 0; types[t].name; t++) { + // Disallow SW rendering unless explicitly enabled + if (types[t].value == VK_PHYSICAL_DEVICE_TYPE_CPU && !sw) + continue; + + for (int i = 0; i < num; i++) { + VkPhysicalDeviceProperties prop = props[i]; + if (prop.deviceType != types[t].value) + continue; + if (name && strcmp(name, prop.deviceName) != 0) + 
continue; + if (!physd_supports_surface(vk, devices[i])) + continue; + + MP_VERBOSE(vk, "Found device:\n"); + MP_VERBOSE(vk, " Device Name: %s\n", prop.deviceName); + MP_VERBOSE(vk, " Device ID: %x:%x\n", prop.vendorID, prop.deviceID); + MP_VERBOSE(vk, " Driver version: %d\n", prop.driverVersion); + MP_VERBOSE(vk, " API version: %d.%d.%d\n", + VK_VERSION_MAJOR(prop.apiVersion), + VK_VERSION_MINOR(prop.apiVersion), + VK_VERSION_PATCH(prop.apiVersion)); + vk->physd = devices[i]; + vk->limits = prop.limits; + talloc_free(devices); + return true; + } + } + +error: + MP_VERBOSE(vk, "Found no suitable device, giving up.\n"); + talloc_free(devices); + return false; +} + +bool mpvk_pick_surface_format(struct mpvk_ctx *vk) +{ + assert(vk->physd); + + VkSurfaceFormatKHR *formats = NULL; + int num; + + // Enumerate through the surface formats and find one that we can map to + // a ra_format + VK(vkGetPhysicalDeviceSurfaceFormatsKHR(vk->physd, vk->surf, &num, NULL)); + formats = talloc_array(NULL, VkSurfaceFormatKHR, num); + VK(vkGetPhysicalDeviceSurfaceFormatsKHR(vk->physd, vk->surf, &num, formats)); + + for (int i = 0; i < num; i++) { + // A value of VK_FORMAT_UNDEFINED means we can pick anything we want + if (formats[i].format == VK_FORMAT_UNDEFINED) { + vk->surf_format = (VkSurfaceFormatKHR) { + .colorSpace = VK_COLOR_SPACE_SRGB_NONLINEAR_KHR, + .format = VK_FORMAT_R8G8B8A8_UNORM, + }; + break; + } + + if (formats[i].colorSpace != VK_COLOR_SPACE_SRGB_NONLINEAR_KHR) + continue; + + vk->surf_format = formats[i]; + break; + } + + talloc_free(formats); + + if (!vk->surf_format.format) + goto error; + + return true; + +error: + MP_ERR(vk, "Failed picking surface format!\n"); + talloc_free(formats); + return false; +} + +bool mpvk_surface_init(struct vo *vo, struct mpvk_ctx *vk) +{ + assert(vk->inst); + VkResult res; + +#if HAVE_VULKAN_XLIB + if (!vo_x11_init(vo)) + goto xlib_uninit; + + if (!vo_x11_create_vo_window(vo, NULL, "mpvk")) + goto xlib_uninit; + + 
VkXlibSurfaceCreateInfoKHR xinfo = { + .sType = VK_STRUCTURE_TYPE_XLIB_SURFACE_CREATE_INFO_KHR, + .dpy = vo->x11->display, + .window = vo->x11->window, + }; + + res = vkCreateXlibSurfaceKHR(vk->inst, &xinfo, MPVK_ALLOCATOR, &vk->surf); + if (res != VK_SUCCESS) { + MP_VERBOSE(vo, "Failed creating Xlib surface: %s\n", vk_err(res)); + goto xlib_uninit; + } + + MP_VERBOSE(vo, "Using Xlib surface.\n"); + return true; + +xlib_uninit: + vo_x11_uninit(vo); +#endif + + // If we're reached this point, then none of the above surface probes + // were successful + MP_ERR(vo, "Failed creating any useful vulkan surface!\n"); + return false; +} + +bool mpvk_device_init(struct mpvk_ctx *vk) +{ + assert(vk->physd); + + VkQueueFamilyProperties *qfs = NULL; + int qfnum; + + // Enumerate the queue families and find suitable families for each task + vkGetPhysicalDeviceQueueFamilyProperties(vk->physd, &qfnum, NULL); + qfs = talloc_array(NULL, VkQueueFamilyProperties, qfnum); + vkGetPhysicalDeviceQueueFamilyProperties(vk->physd, &qfnum, qfs); + + MP_VERBOSE(vk, "Queue families supported by device:\n"); + + for (int i = 0; i < qfnum; i++) { + MP_VERBOSE(vk, "QF %d: flags 0x%x num %d\n", i, qfs[i].queueFlags, + qfs[i].queueCount); + } + + // Since using multiple queue families is devilishly difficult, we just + // pick a single queue family and stick with it. So in the interest of this, + // it's best to pick the one that supports the most features. + + int idx = -1; + for (int i = 0; i < qfnum; i++) { + if (!(qfs[i].queueFlags & VK_QUEUE_GRAPHICS_BIT)) + continue; + + // QF supports more features + if (idx < 0 || qfs[i].queueFlags > qfs[idx].queueFlags) + idx = i; + + // QF supports more queues (at the same specialization level) + if (qfs[i].queueFlags == qfs[idx].queueFlags && + qfs[i].queueCount > qfs[idx].queueCount) + { + idx = i; + } + } + + // Vulkan requires at least one GRAPHICS queue, so if this fails something + // is horribly wrong. 
+ assert(idx >= 0); + + // Now that we know which queue family we want, we can create the logical + // device + static const float priorities[MPVK_MAX_QUEUES] = {0}; + VkDeviceQueueCreateInfo qinfo = { + .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO, + .queueFamilyIndex = idx, + .queueCount = MPMIN(qfs[idx].queueCount, MPVK_MAX_QUEUES), + .pQueuePriorities = priorities, + }; + + static const char *exts[] = { + VK_KHR_SWAPCHAIN_EXTENSION_NAME, + VK_NV_GLSL_SHADER_EXTENSION_NAME, + }; + + VkDeviceCreateInfo dinfo = { + .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, + .queueCreateInfoCount = 1, + .pQueueCreateInfos = &qinfo, + .ppEnabledExtensionNames = exts, + .enabledExtensionCount = MP_ARRAY_SIZE(exts), + }; + + MP_VERBOSE(vk, "Creating vulkan device...\n"); + VK(vkCreateDevice(vk->physd, &dinfo, MPVK_ALLOCATOR, &vk->dev)); + + vk_malloc_init(vk); + + // Create the vk_cmdpool and all required queues / synchronization objects + struct vk_cmdpool *pool = vk->pool = talloc_zero(NULL, struct vk_cmdpool); + *pool = (struct vk_cmdpool) { + .qf = qinfo.queueFamilyIndex, + .props = qfs[qinfo.queueFamilyIndex], + .qcount = qinfo.queueCount, + }; + + talloc_free(qfs); + + for (int n = 0; n < pool->qcount; n++) + vkGetDeviceQueue(vk->dev, pool->qf, n, &pool->queues[n]); + + VkCommandPoolCreateInfo cinfo = { + .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, + .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | + VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, + .queueFamilyIndex = pool->qf, + }; + + VK(vkCreateCommandPool(vk->dev, &cinfo, MPVK_ALLOCATOR, &pool->pool)); + + VkCommandBufferAllocateInfo ainfo = { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, + .commandPool = pool->pool, + .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY, + .commandBufferCount = MPVK_MAX_CMDS, + }; + + VkCommandBuffer cmdbufs[MPVK_MAX_CMDS]; + VK(vkAllocateCommandBuffers(vk->dev, &ainfo, cmdbufs)); + + for (int n = 0; n < MPVK_MAX_CMDS; n++) { + struct vk_cmd *cmd = &pool->cmds[n]; + 
cmd->pool = pool; + cmd->buf = cmdbufs[n]; + + VkFenceCreateInfo finfo = { + .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO, + .flags = VK_FENCE_CREATE_SIGNALED_BIT, + }; + + VK(vkCreateFence(vk->dev, &finfo, MPVK_ALLOCATOR, &cmd->fence)); + + VkSemaphoreCreateInfo sinfo = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, + }; + + VK(vkCreateSemaphore(vk->dev, &sinfo, MPVK_ALLOCATOR, &cmd->done)); + } + + // Ensure we can actually present to the surface using this queue + VkBool32 sup; + VK(vkGetPhysicalDeviceSurfaceSupportKHR(vk->physd, pool->qf, vk->surf, &sup)); + if (!sup) { + MP_ERR(vk, "Queue family does not support surface presentation!\n"); + goto error; + } + + return true; + +error: + MP_ERR(vk, "Failed creating logical device!\n"); + talloc_free(qfs); + return false; +} + +static void run_callbacks(struct vk_cmd *cmd) +{ + for (int i = 0; i < cmd->num_callbacks; i++) { + struct vk_callback *cb = &cmd->callbacks[i]; + cb->run(cb->priv, cb->arg); + *cb = (struct vk_callback){0}; + } + + cmd->num_callbacks = 0; +} + +static void wait_for_cmds(struct mpvk_ctx *vk, struct vk_cmd cmds[], int num) +{ + if (!num) + return; + + VkFence fences[MPVK_MAX_CMDS]; + for (int i = 0; i < num; i++) + fences[i] = cmds[i].fence; + + vkWaitForFences(vk->dev, num, fences, true, UINT64_MAX); + + for (int i = 0; i < num; i++) + run_callbacks(&cmds[i]); +} + +void mpvk_wait_idle(struct mpvk_ctx *vk) +{ + struct vk_cmdpool *pool = vk->pool; + + int idx = pool->cindex, pidx = pool->cindex_pending; + if (pidx < idx) { // range doesn't wrap + wait_for_cmds(vk, &pool->cmds[pidx], idx - pidx); + } else if (pidx > idx) { // range wraps + wait_for_cmds(vk, &pool->cmds[pidx], MPVK_MAX_CMDS - pidx); + wait_for_cmds(vk, &pool->cmds[0], idx); + } + pool->cindex_pending = pool->cindex; +} + +void mpvk_poll_cmds(struct mpvk_ctx *vk, struct vk_cmdpool *pool, + uint64_t timeout) +{ + // If requested, hard block until at least one command completes + if (timeout > 0 && pool->cindex_pending != 
pool->cindex) { + vkWaitForFences(vk->dev, 1, &pool->cmds[pool->cindex_pending].fence, + true, timeout); + } + + // Lazily garbage collect the commands based on their status + while (pool->cindex_pending != pool->cindex) { + struct vk_cmd *cmd = &pool->cmds[pool->cindex_pending]; + VkResult res = vkGetFenceStatus(vk->dev, cmd->fence); + if (res != VK_SUCCESS) + break; + run_callbacks(cmd); + pool->cindex_pending++; + pool->cindex_pending %= MPVK_MAX_CMDS; + } +} + +void vk_dev_callback(struct mpvk_ctx *vk, vk_cb callback, void *p, void *arg) +{ + struct vk_cmdpool *pool = vk->pool; + if (pool->cindex_pending == pool->cindex) { + // The device was already idle, so we can just immediately call it + callback(p, arg); + return; + } + + int prev_idx = pool->cindex - 1; + if (prev_idx < 0) + prev_idx += MPVK_MAX_CMDS; + + struct vk_cmd *last_cmd = &pool->cmds[prev_idx]; + vk_cmd_callback(last_cmd, callback, p, arg); +} + +const VkImageSubresourceRange vk_range = { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .levelCount = 1, + .layerCount = 1, +}; + +const VkImageSubresourceLayers vk_layers = { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .layerCount = 1, +}; + +void vk_cmd_callback(struct vk_cmd *cmd, vk_cb callback, void *p, void *arg) +{ + MP_TARRAY_GROW(NULL, cmd->callbacks, cmd->num_callbacks); + cmd->callbacks[cmd->num_callbacks++] = (struct vk_callback) { + .run = callback, + .priv = p, + .arg = arg, + }; +} + +void vk_cmd_dep(struct vk_cmd *cmd, VkSemaphore dep, + VkPipelineStageFlagBits depstage) +{ + assert(cmd->num_deps < MPVK_MAX_CMD_DEPS); + cmd->deps[cmd->num_deps] = dep; + cmd->depstages[cmd->num_deps++] = depstage; +} + +struct vk_cmd *vk_cmd_begin(struct mpvk_ctx *vk, struct vk_cmdpool *pool) +{ + // Garbage collect the cmdpool first + mpvk_poll_cmds(vk, pool, 0); + + int next = (pool->cindex + 1) % MPVK_MAX_CMDS; + if (next == pool->cindex_pending) { + MP_ERR(vk, "No free command buffers!\n"); + goto error; + } + + struct vk_cmd *cmd = 
&pool->cmds[pool->cindex]; + pool->cindex = next; + + VK(vkResetCommandBuffer(cmd->buf, 0)); + + VkCommandBufferBeginInfo binfo = { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, + .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT, + }; + + VK(vkBeginCommandBuffer(cmd->buf, &binfo)); + + return cmd; + +error: + return NULL; +} + +bool vk_cmd_submit(struct mpvk_ctx *vk, struct vk_cmd *cmd, VkSemaphore *done) +{ + VK(vkEndCommandBuffer(cmd->buf)); + + struct vk_cmdpool *pool = cmd->pool; + VkQueue queue = pool->queues[pool->qindex++]; + pool->qindex %= pool->qcount; + + VkSubmitInfo sinfo = { + .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO, + .commandBufferCount = 1, + .pCommandBuffers = &cmd->buf, + .waitSemaphoreCount = cmd->num_deps, + .pWaitSemaphores = cmd->deps, + .pWaitDstStageMask = cmd->depstages, + }; + + if (done) { + sinfo.signalSemaphoreCount = 1; + sinfo.pSignalSemaphores = &cmd->done; + *done = cmd->done; + } + + VK(vkResetFences(vk->dev, 1, &cmd->fence)); + VK(vkQueueSubmit(queue, 1, &sinfo, cmd->fence)); + MP_TRACE(vk, "Submitted command on queue %p\n", (void *)queue); + + for (int i = 0; i < cmd->num_deps; i++) + cmd->deps[i] = NULL; + cmd->num_deps = 0; + + return true; + +error: + return false; +} + +static bool vk_swchain_update_info(struct vk_swchain *chain, + VkSwapchainCreateInfoKHR *info) +{ + struct mpvk_ctx *vk = chain->vk; + + // Query the supported capabilities and update this struct as needed + VkSurfaceCapabilitiesKHR caps; + VK(vkGetPhysicalDeviceSurfaceCapabilitiesKHR(vk->physd, vk->surf, &caps)); + + // Sorted by preference + static const VkCompositeAlphaFlagBitsKHR alphaModes[] = { + VK_COMPOSITE_ALPHA_POST_MULTIPLIED_BIT_KHR, + VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR, + }; + + for (int i = 0; i < MP_ARRAY_SIZE(alphaModes); i++) { + if (caps.supportedCompositeAlpha & alphaModes[i]) { + info->compositeAlpha = alphaModes[i]; + break; + } + } + + if (!info->compositeAlpha) { + MP_ERR(vk, "Failed picking alpha compositing mode (caps: 
%d)\n", + caps.supportedCompositeAlpha); + goto error; + } + + static const VkSurfaceTransformFlagBitsKHR rotModes[] = { + VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR, + VK_SURFACE_TRANSFORM_INHERIT_BIT_KHR, + }; + + for (int i = 0; i < MP_ARRAY_SIZE(rotModes); i++) { + if (caps.supportedTransforms & rotModes[i]) { + info->preTransform = rotModes[i]; + break; + } + } + + if (!info->preTransform) { + MP_ERR(vk, "Failed picking surface transform mode (caps: %d)\n", + caps.supportedTransforms); + goto error; + } + + // Image count as required + info->minImageCount = MPMAX(info->minImageCount, caps.minImageCount); + if (caps.maxImageCount) + info->minImageCount = MPMIN(info->minImageCount, caps.maxImageCount); + + // Check the extent against the allowed parameters + if (caps.currentExtent.width != info->imageExtent.width && + caps.currentExtent.width != 0xFFFFFFFF) + { + MP_WARN(vk, "Requested width %d does not match current width %d\n", + info->imageExtent.width, caps.currentExtent.width); + info->imageExtent.width = caps.currentExtent.width; + } + + if (caps.currentExtent.height != info->imageExtent.height && + caps.currentExtent.height != 0xFFFFFFFF) + { + MP_WARN(vk, "Requested height %d does not match current height %d\n", + info->imageExtent.height, caps.currentExtent.height); + info->imageExtent.height = caps.currentExtent.height; + } + + if (caps.minImageExtent.width > info->imageExtent.width || + caps.minImageExtent.height > info->imageExtent.height) + { + MP_ERR(vk, "Requested size %dx%d smaller than device minimum %dx%d\n", + info->imageExtent.width, info->imageExtent.height, + caps.minImageExtent.width, caps.minImageExtent.height); + goto error; + } + + if (caps.maxImageExtent.width < info->imageExtent.width || + caps.maxImageExtent.height < info->imageExtent.height) + { + MP_ERR(vk, "Requested size %dx%d larger than device maximum %dx%d\n", + info->imageExtent.width, info->imageExtent.height, + caps.maxImageExtent.width, caps.maxImageExtent.height); + goto error; 
+ } + + // We just request whatever usage we can, and let the ra_vk decide what + // ra_tex_params that translates to. This makes the images as flexible + // as possible. + info->imageUsage = caps.supportedUsageFlags; + return true; + +error: + return false; +} + +bool vk_swchain_init(struct mpvk_ctx *vk, struct ra *ra, int size, + struct vk_swchain *chain) +{ + assert(vk->dev); + assert(vk->surf_format.format); + + struct VkSwapchainCreateInfoKHR dummy = { + .sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR, + .surface = vk->surf, + .minImageCount = size, + .imageFormat = vk->surf_format.format, + .imageColorSpace = vk->surf_format.colorSpace, + .imageArrayLayers = 1, // non-stereoscopic + .imageSharingMode = VK_SHARING_MODE_EXCLUSIVE, + .presentMode = VK_PRESENT_MODE_FIFO_KHR, + .clipped = true, + }; + + *chain = (struct vk_swchain) { + .vk = vk, + .ra = ra, + .protoInfo = dummy, + }; + + return true; +} + +void vk_swchain_uninit(struct ra *ra, struct vk_swchain *chain) +{ + struct mpvk_ctx *vk = chain->vk; + if (!vk) + return; + + // Note: We technically don't even need the struct *ra, it's just there + // to "force" the correct uninitialization order at the API level. Either + // way, make sure the RA actually matches.. 
+ assert(ra == chain->ra); + + mpvk_wait_idle(vk); + + for (int i = 0; i < chain->num_images; i++) + ra_tex_free(ra, &chain->images[i]); + for (int i = 0; i < chain->num_acquired; i++) + vkDestroySemaphore(vk->dev, chain->acquired[i], MPVK_ALLOCATOR); + + vkDestroySwapchainKHR(vk->dev, chain->swchain, MPVK_ALLOCATOR); + + talloc_free(chain->images); + talloc_free(chain->acquired); + *chain = (struct vk_swchain){0}; +} + +static void destroy_swapchain(struct mpvk_ctx *vk, VkSwapchainKHR swchain) +{ + vkDestroySwapchainKHR(vk->dev, swchain, MPVK_ALLOCATOR); +} + +bool vk_swchain_resize(struct vk_swchain *chain, int w, int h) +{ + if (w == chain->w && h == chain->h) + return true; + + struct mpvk_ctx *vk = chain->vk; + VkImage *vkimages = NULL; + bool ret = false; + + VkSwapchainCreateInfoKHR sinfo = chain->protoInfo; + sinfo.imageExtent = (VkExtent2D){ w, h }; + sinfo.oldSwapchain = chain->swchain; + + if (!vk_swchain_update_info(chain, &sinfo)) + goto error; + + VK(vkCreateSwapchainKHR(vk->dev, &sinfo, MPVK_ALLOCATOR, &chain->swchain)); + chain->w = w; + chain->h = h; + + // Freeing the old swapchain while it's still in use is an error, so do + // it asynchronously once the device is idle. 
+ if (sinfo.oldSwapchain) + vk_dev_callback(vk, (vk_cb) destroy_swapchain, vk, sinfo.oldSwapchain); + + // Get the new swapchain images + uint32_t num; + VK(vkGetSwapchainImagesKHR(vk->dev, chain->swchain, &num, NULL)); + vkimages = talloc_array(NULL, VkImage, num); + VK(vkGetSwapchainImagesKHR(vk->dev, chain->swchain, &num, vkimages)); + + // If needed, allocate some more semaphores + while (num > chain->num_acquired) { + VkSemaphore sem; + static const VkSemaphoreCreateInfo seminfo = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, + }; + VK(vkCreateSemaphore(vk->dev, &seminfo, MPVK_ALLOCATOR, &sem)); + MP_TARRAY_APPEND(NULL, chain->acquired, chain->num_acquired, sem); + } + + // Recreate the ra_tex wrappers + for (int i = 0; i < chain->num_images; i++) + ra_tex_free(chain->ra, &chain->images[i]); + + chain->num_images = num; + MP_TARRAY_GROW(NULL, chain->images, chain->num_images); + for (int i = 0; i < num; i++) { + chain->images[i] = ra_vk_wrap_swchain_img(chain->ra, vkimages[i], sinfo); + if (!chain->images[i]) + goto error; + } + + ret = true; + +error: + talloc_free(vkimages); + return ret; +} + +bool vk_swchain_get(struct vk_swchain *chain, struct vk_swimg *out) +{ + struct mpvk_ctx *vk = chain->vk; + + int semidx = chain->idx_acquired++; + chain->idx_acquired %= chain->num_acquired; + + uint32_t imgidx = 0; + VK(vkAcquireNextImageKHR(vk->dev, chain->swchain, UINT64_MAX, + chain->acquired[semidx], NULL, &imgidx)); + + *out = (struct vk_swimg) { + .chain = chain, + .index = imgidx, + .image = chain->images[imgidx], + .acquired = chain->acquired[semidx], + }; + return true; + +error: + return false; +} diff --git a/video/out/vulkan/utils.h b/video/out/vulkan/utils.h new file mode 100644 index 0000000000000..6273ebca95ef4 --- /dev/null +++ b/video/out/vulkan/utils.h @@ -0,0 +1,178 @@ +#pragma once + +#include "video/out/vo.h" +#include "video/mp_image.h" + +#include "common.h" +#include "formats.h" + +#define VK_LOAD_PFN(name) PFN_##name pfn_##name = 
(PFN_##name) \ + vkGetInstanceProcAddr(vk->inst, #name); + +// Return a human-readable name for a VkResult error code +const char* vk_err(VkResult res); + +// Convenience macros to simplify a lot of common boilerplate +#define VK_ASSERT(res, str) \ + if (res != VK_SUCCESS) { \ + MP_ERR(vk, str ": %s\n", vk_err(res)); \ + goto error; \ + } + +#define VK(cmd) \ + { \ + MP_TRACE(vk, #cmd "\n"); \ + VkResult res ## __LINE__ = (cmd); \ + VK_ASSERT(res ## __LINE__, #cmd); \ + } + +// Uninits everything in the correct order +void mpvk_uninit(struct mpvk_ctx *vk); + +// Initialization functions: As a rule of thumb, these need to be called in +// this order, followed by vk_malloc_init, followed by RA initialization, and +// finally followed by vk_swchain initialization. + +// Create a vulkan instance. Returns false on failure +bool mpvk_instance_init(struct mpvk_ctx *vk, bool validate); + +// Generate a VkSurfaceKHR usable for video output. Returns false on +// failure. Must be called after mpvk_instance_init. +bool mpvk_surface_init(struct vo *vo, struct mpvk_ctx *vk); + +// Find a suitable physical device for use with rendering and which supports +// the surface. +// name: only match a device with this name +// sw: also allow software/virtual devices +bool mpvk_find_phys_device(struct mpvk_ctx *vk, const char *name, bool sw); + +// Pick a suitable surface format that's supported by this physical device. +bool mpvk_pick_surface_format(struct mpvk_ctx *vk); + +// Create a logical device and initialize the vk_cmdpools +bool mpvk_device_init(struct mpvk_ctx *vk); + +// Wait until all commands submitted to all queues have completed +void mpvk_wait_idle(struct mpvk_ctx *vk); + +// Wait until at least one command submitted to any queue has completed, and +// process the callbacks. Good for event loops that need to delay until a +// command completes. Will block at most `timeout` nanoseconds. 
If used with +// 0, it only garbage collects completed commands without blocking. +void mpvk_poll_cmds(struct mpvk_ctx *vk, struct vk_cmdpool *pool, + uint64_t timeout); + +// Predefined structs for a simple non-layered, non-mipped image +extern const VkImageSubresourceRange vk_range; +extern const VkImageSubresourceLayers vk_layers; + +// Since lots of vulkan operations need to be done lazily once the affected +// resources are no longer in use, provide an abstraction for tracking these. +// In practice, these are only checked and run when submitting new commands, so +// the actual execution may be delayed by a frame. +typedef void (*vk_cb)(void *priv, void *arg); + +struct vk_callback { + vk_cb run; + void *priv; + void *arg; // as a convenience, you also get to pass an arg for "free" +}; + +// Associate a callback with the completion of all currently pending commands. +// This will essentially run once the device is completely idle. +void vk_dev_callback(struct mpvk_ctx *vk, vk_cb callback, void *p, void *arg); + +#define MPVK_MAX_CMD_DEPS 8 + +// Helper wrapper around command buffers that also track dependencies, +// callbacks and synchronization primitives +struct vk_cmd { + struct vk_cmdpool *pool; // pool it was allocated from + VkCommandBuffer buf; + VkFence fence; // the fence guards cmd buffer reuse + VkSemaphore done; // the semaphore signals when execution is done + // The semaphores represent dependencies that need to complete before + // this command can be executed. These are *not* owned by the vk_cmd + VkSemaphore deps[MPVK_MAX_CMD_DEPS]; + VkPipelineStageFlags depstages[MPVK_MAX_CMD_DEPS]; + int num_deps; + // Since VkFences are useless, we have to manually track "callbacks" + // to fire once the VkFence completes. These are used for multiple purposes, + // ranging from garbage collection (resource deallocation) to fencing. 
+ struct vk_callback *callbacks; + int num_callbacks; +}; + +// Associate a callback with the completion of the current command. The +// callback will be fired once the command completes, or shortly thereafter. +void vk_cmd_callback(struct vk_cmd *cmd, vk_cb callback, void *p, void *arg); + +// Associate a dependency for the current command. This semaphore must signal +// by the corresponding stage before the command may execute. +void vk_cmd_dep(struct vk_cmd *cmd, VkSemaphore dep, + VkPipelineStageFlagBits depstage); + +#define MPVK_MAX_QUEUES 8 +#define MPVK_MAX_CMDS 16 + +// Command pool / queue family hybrid abstraction +struct vk_cmdpool { + VkQueueFamilyProperties props; + uint32_t qf; // queue family index + VkCommandPool pool; + VkQueue queues[MPVK_MAX_QUEUES]; + int qcount; + int qindex; + // Command buffers associated with this queue. (No, VkCommandPool is not + // a pool of command buffers), so you still have to pool them manually. We + // also have to keep track of "in flight" (pending) command buffers separately + // to work around vkQueueWaitIdle being completely fucking useless when + // using a queue for presentation. + struct vk_cmd cmds[MPVK_MAX_CMDS]; + int cindex; + int cindex_pending; +}; + +// Fetch the next command buffer from a command pool and begin recording to it. +// Returns NULL on failure. +struct vk_cmd *vk_cmd_begin(struct mpvk_ctx *vk, struct vk_cmdpool *pool); + +// Finish the currently recording command buffer and submit it for execution. +// If `done` is not NULL, it will be set to a semaphore that will signal once +// the command completes. (And MUST have a corresponding semaphore wait) +// Returns whether successful. 
+bool vk_cmd_submit(struct mpvk_ctx *vk, struct vk_cmd *cmd, VkSemaphore *done); + +// Swapchain +struct vk_swchain { + struct mpvk_ctx *vk; + struct ra *ra; + int w, h; // current size + VkSwapchainCreateInfoKHR protoInfo; // partially filled-in prototype + VkSwapchainKHR swchain; + // state of the images: + struct ra_tex **images; // ra_tex wrappers for the vkimages + int num_images; // size of images + VkSemaphore *acquired; // pool of semaphores used to synchronize images + int num_acquired; // size of this pool + int idx_acquired; // index of next free semaphore within this pool +}; + +// depth: desired number of swapchain images +bool vk_swchain_init(struct mpvk_ctx *vk, struct ra *ra, int depth, + struct vk_swchain *chain); +void vk_swchain_uninit(struct ra *ra, struct vk_swchain *chain); +bool vk_swchain_resize(struct vk_swchain *chain, int w, int h); + +// Swapchain image +struct vk_swimg { + struct vk_swchain *chain; // vk_swchain it was allocated from + int index; // index within that vk_swchain + struct ra_tex *image; // ra_tex wrapper for this image + VkSemaphore acquired; // will be signalled once the image is ready +}; + +// Get the next vk_swimg. This may block if all swapchain images are still in +// use, but normally the user should allocate a larger swapchain than what they +// actually use. +bool vk_swchain_get(struct vk_swchain *chain, struct vk_swimg *out); diff --git a/wscript b/wscript index 9d885884d7476..42e5e726650da 100644 --- a/wscript +++ b/wscript @@ -780,6 +780,16 @@ video_output_features = [ 'fmsg': "No OpenGL video output found or enabled. " + "Aborting. If you really mean to compile without OpenGL " + "video outputs use --disable-gl." 
+ }, { + 'name': '--vulkan-xlib', + 'desc': 'Vulkan Xlib backend', + 'func': check_true, + 'deps': ['x11'], + }, { + 'name': '--vulkan', + 'desc': 'Vulkan video output', + 'deps_any': [ 'vulkan-xlib' ], + 'func': check_cc(header_name='vulkan/vulkan.h', lib='vulkan'), }, { 'name': 'egl-helpers', 'desc': 'EGL helper functions', diff --git a/wscript_build.py b/wscript_build.py index 3c5c00dc6415e..878b1faf02513 100644 --- a/wscript_build.py +++ b/wscript_build.py @@ -437,12 +437,17 @@ def build(ctx): ( "video/out/vo_tct.c" ), ( "video/out/vo_vaapi.c", "vaapi-x11" ), ( "video/out/vo_vdpau.c", "vdpau" ), + ( "video/out/vo_vulkan.c", "vulkan" ), ( "video/out/vo_wayland.c", "wayland" ), ( "video/out/vo_x11.c" , "x11" ), ( "video/out/vo_xv.c", "xv" ), ( "video/out/w32_common.c", "win32-desktop" ), ( "video/out/win32/displayconfig.c", "win32-desktop" ), ( "video/out/win32/droptarget.c", "win32-desktop" ), + ( "video/out/vulkan/utils.c", "vulkan" ), + ( "video/out/vulkan/malloc.c", "vulkan" ), + ( "video/out/vulkan/formats.c", "vulkan" ), + ( "video/out/vulkan/ra_vk.c", "vulkan" ), ( "video/out/win32/exclusive_hack.c", "gl-win32" ), ( "video/out/wayland_common.c", "wayland" ), ( "video/out/wayland/buffer.c", "wayland" ),