normalize codec code:

* csc modules now also have an "init_module()"
* nvcuda caches kernels (same as was done for nvenc)
* opencl kernel: rename "init_context" to "select_device"

git-svn-id: https://xpra.org/svn/Xpra/trunk@4726 3bb7dfac-3a0b-4e04-842a-767bc560f471
Xpra-org · Nov 10, 2013 · 196a436 · 196a436
1 parent 0162f8f
commit 196a436
Show file tree

Hide file tree

Showing 4 changed files with 94 additions and 57 deletions.
diff --git a/src/xpra/codecs/csc_nvcuda/colorspace_converter.py b/src/xpra/codecs/csc_nvcuda/colorspace_converter.py
@@ -18,26 +18,62 @@
 assert bytearray
 import pycuda                               #@UnresolvedImport
 from pycuda import driver                   #@UnresolvedImport
-from pycuda.compiler import SourceModule    #@UnresolvedImport
+from pycuda.compiler import compile         #@UnresolvedImport
 driver.init()
 
+
+DEFAULT_CUDA_DEVICE_ID = int(os.environ.get("XPRA_CUDA_DEVICE", "0"))
+COLORSPACES_MAP = {
+                   "BGRA" : ("YUV420P", "YUV422P", "YUV444P"),
+                   "BGRX" : ("YUV420P", "YUV422P", "YUV444P"),
+                   "RGBA" : ("YUV420P", "YUV422P", "YUV444P"),
+                   "RGBX" : ("YUV420P", "YUV422P", "YUV444P"),
+                   }
+KERNELS_MAP = {}
+
+
 def log_sys_info():
     log.info("PyCUDA version=%s", ".".join([str(x) for x in driver.get_version()]))
     log.info("PyCUDA driver version=%s", driver.get_driver_version())
 
 def device_info(d):
     return "%s @ %s" % (d.name(), d.pci_bus_id())
 
-DEFAULT_CUDA_DEVICE_ID = int(os.environ.get("XPRA_CUDA_DEVICE", "0"))
+def roundup(n, m):
+    return (n + m - 1) & ~(m - 1)
+
+
+#cache pre-compiled kernel cubins per device:
+KERNEL_cubins = {}
+def get_CUDA_kernel(device_id, src_format, dst_format):
+    start = time.time()
+    k = KERNELS_MAP.get((src_format, dst_format))
+    assert k is not None, "no kernel found for %s to %s" % (src_format, dst_format)
+    function_name, ksrc = k
+    global KERNEL_cubins
+    cubin = KERNEL_cubins.get((device_id, function_name))
+    if cubin is None:
+        debug("compiling for device %s: %s=%s", device_id, function_name, ksrc)
+        cubin = compile(ksrc)
+        KERNEL_cubins[(device_id, function_name)] = cubin
+    #now load from cubin:
+    mod = driver.module_from_buffer(cubin)
+    CUDA_function = mod.get_function(function_name)
+    end = time.time()
+    debug("compilation of %s took %.1fms", function_name, 1000.0*(end-start))
+    return function_name, CUDA_function
+
 
 selected_device = None
+selected_device_id = None
 def select_device():
-    global selected_device
+    global selected_device, selected_device_id
     if selected_device is not None:
-        return selected_device
+        return selected_device_id, selected_device
     ngpus = driver.Device.count()
     log.info("PyCUDA found %s device(s):", ngpus)
     device = None
+    device_id = -1
     for i in range(ngpus):
         d = driver.Device(i)
         host_mem = d.get_attribute(driver.device_attribute.CAN_MAP_HOST_MEMORY)
@@ -50,9 +86,11 @@ def select_device():
         #debug("compute_capability=%s, attributes=%s", d.compute_capability(), attr)
         if host_mem and (device is None or i==DEFAULT_CUDA_DEVICE_ID):
             device = d
+            device_id = i
     selected_device = device
-    return selected_device
-assert select_device() is not None, "no valid CUDA devices found"
+    selected_device_id = device_id
+    assert selected_device is not None, "no valid CUDA devices found"
+    return selected_device_id, selected_device
 
 context = None
 context_wrapper = None
@@ -70,38 +108,32 @@ def cleanup(self):
             self.context.detach()
             self.context = None
 
-def init_context():
+
+def init_module():
     global context, context_wrapper
     if context_wrapper is not None:
         return
     log_sys_info()
-    device = select_device()
+    device_id, device = select_device()
     context = device.make_context(flags=driver.ctx_flags.SCHED_YIELD | driver.ctx_flags.MAP_HOST)
     debug("testing with context=%s", context)
     debug("api version=%s", context.get_api_version())
     free, total = driver.mem_get_info()
     debug("using device %s",  device_info(device))
     debug("memory: free=%sMB, total=%sMB",  int(free/1024/1024), int(total/1024/1024))
-    #context.pop()
     context_wrapper = CudaContextWrapper(context)
-    context.pop()
 
-def roundup(n, m):
-    return (n + m - 1) & ~(m - 1)
+    #generate kernel sources:
+    for rgb_format, yuv_formats in COLORSPACES_MAP.items():
+        m = gen_rgb_to_yuv_kernels(rgb_format, yuv_formats)
+        KERNELS_MAP.update(m)
+    _kernel_names_ = sorted(set([x[0] for x in KERNELS_MAP.values()]))
+    log.info("%s csc_nvcuda kernels: %s", len(_kernel_names_), ", ".join(_kernel_names_))
 
-
-COLORSPACES_MAP = {
-                   "BGRA" : ("YUV420P", "YUV422P", "YUV444P"),
-                   "BGRX" : ("YUV420P", "YUV422P", "YUV444P"),
-                   "RGBA" : ("YUV420P", "YUV422P", "YUV444P"),
-                   "RGBX" : ("YUV420P", "YUV422P", "YUV444P"),
-                   }
-KERNELS_MAP = {}
-for rgb_format, yuv_formats in COLORSPACES_MAP.items():
-    m = gen_rgb_to_yuv_kernels(rgb_format, yuv_formats)
-    KERNELS_MAP.update(m)
-_kernel_names_ = sorted(set([x[0] for x in KERNELS_MAP.values()]))
-log.info("%s csc_nvcuda kernels: %s", len(_kernel_names_), ", ".join(_kernel_names_))
+    #now, pre-compile the kernels:
+    for src_format, dst_format in KERNELS_MAP.keys():
+        get_CUDA_kernel(device_id, src_format, dst_format)
+    context.pop()
 
 def get_type():
     return "nvcuda"
@@ -135,6 +167,7 @@ def __init__(self):
         self.dst_width = 0
         self.dst_height = 0
         self.dst_format = ""
+        self.device_id = 0
         self.time = 0
         self.frames = 0
         self.cuda_device = None
@@ -156,25 +189,20 @@ def init_context(self, src_width, src_height, src_format,
         self.dst_format = dst_format
         assert self.src_width==self.dst_width and self.src_height==self.dst_height, "scaling is not supported! (%sx%s to %sx%s)" % (self.src_width, self.src_height, self.dst_width, self.dst_height)
 
+        self.device_id = DEFAULT_CUDA_DEVICE_ID
         self.init_cuda(0)
 
-    def init_cuda(self, device_id):
-        debug("init_cuda(%s)", device_id)
-        self.cuda_device = driver.Device(DEFAULT_CUDA_DEVICE_ID)
+    def init_cuda(self):
+        debug("init_cuda() device_id=%s", self.device_id)
+        self.cuda_device = driver.Device(self.device_id)
         self.cuda_context = self.cuda_device.make_context(flags=driver.ctx_flags.SCHED_AUTO | driver.ctx_flags.MAP_HOST)
         #use alias to make code easier to read:
         d = self.cuda_device
         da = driver.device_attribute
         try:
-            debug("init_cuda(%s) cuda_device=%s, cuda_context=%s, thread=%s", device_id, self.cuda_device, self.cuda_context, threading.current_thread())
+            debug("init_cuda() cuda_device=%s, cuda_context=%s, thread=%s", self.cuda_device, self.cuda_context, threading.current_thread())
             #compile/get kernel:
-            key = self.src_format, self.dst_format
-            k = KERNELS_MAP.get(key)
-            assert k is not None, "kernel not found for %s" % str(key)
-            self.kernel_function_name, ksrc = k
-            debug("init_cuda(%s) compiling kernel %s: %s", device_id, self.kernel_function_name, ksrc)
-            mod = SourceModule(ksrc)
-            self.kernel_function = mod.get_function(self.kernel_function_name)
+            self.kernel_function_name, self.kernel_function = self.get_CUDA_kernel(self.device_id, self.src_format, self.dst_format)
 
             self.max_block_sizes = d.get_attribute(da.MAX_BLOCK_DIM_X), d.get_attribute(da.MAX_BLOCK_DIM_Y), d.get_attribute(da.MAX_BLOCK_DIM_Z)
             self.max_grid_sizes = d.get_attribute(da.MAX_GRID_DIM_X), d.get_attribute(da.MAX_GRID_DIM_Y), d.get_attribute(da.MAX_GRID_DIM_Z)

diff --git a/src/xpra/codecs/csc_opencl/colorspace_converter.py b/src/xpra/codecs/csc_opencl/colorspace_converter.py
@@ -25,13 +25,21 @@
 if len(opencl_platforms)==0:
     raise ImportError("no OpenCL platforms found!")
 
+def roundup(n, m):
+    return (n + m - 1) & ~(m - 1)
+
+def dimdiv(dim, div):
+    #when we divide a dimensions by the subsampling
+    #we want to round up so as to include the last
+    #pixel when we hit odd dimensions
+    return roundup(dim/div, div)
+
 def device_type(d):
     try:
         return pyopencl.device_type.to_string(d.type)
     except:
         return d.type
 
-
 def device_info(d):
     dtype = device_type(d)
     if hasattr(d, "opencl_c_version"):
@@ -44,7 +52,6 @@ def is_supported(platform_name):
     #FreeOCL and pocl do not work:
     return not platform_name.startswith("FreeOCL") and not platform_name.startswith("Portable Computing Language")
 
-
 def log_device_info(device):
     if not device:
         return
@@ -76,7 +83,7 @@ def log_version_info():
 selected_device = None
 selected_platform = None
 context = None
-def init_context():
+def select_device():
     global context, selected_device,selected_platform
     if context is not None:
         return
@@ -363,7 +370,7 @@ def build_kernels():
     global program
     if program is not None:
         return
-    init_context()
+    select_device()
     NAMES_TO_KERNELS = gen_kernels()
     with warnings.catch_warnings(record=True) as w:
         def dump_warnings(logfn):
@@ -385,20 +392,13 @@ def dump_warnings(logfn):
             raise ImportError("cannot build the OpenCL program: %s" % e)
 
 
-def roundup(n, m):
-    return (n + m - 1) & ~(m - 1)
-
-def dimdiv(dim, div):
-    #when we divide a dimensions by the subsampling
-    #we want to round up so as to include the last
-    #pixel when we hit odd dimensions
-    return roundup(dim/div, div)
-
-
 from xpra.codecs.image_wrapper import ImageWrapper
 from xpra.codecs.codec_constants import codec_spec, get_subsampling_divs
 
 
+def init_module():
+    build_kernels()
+
 def get_type():
     return "opencl"
 

diff --git a/src/xpra/codecs/csc_swscale/colorspace_converter.pyx b/src/xpra/codecs/csc_swscale/colorspace_converter.pyx
@@ -53,13 +53,6 @@ cdef extern from "libswscale/swscale.h":
                   uint8_t *const dst[], const int dstStride[]) nogil
 
 
-def get_type():
-    return "swscale"
-
-def get_version():
-    return get_swscale_version()
-
-
 cdef class CSCPixelFormat:
     cdef AVPixelFormat av_enum
     cdef char* av_enum_name
@@ -163,6 +156,16 @@ def get_swscale_flags_strs(int flags):
     return strs
 
 
+def init_module():
+    #nothing to do!
+    pass
+
+def get_type():
+    return "swscale"
+
+def get_version():
+    return get_swscale_version()
+
 def get_input_colorspaces():
     return COLORSPACES
 

diff --git a/src/xpra/codecs/video_helper.py b/src/xpra/codecs/video_helper.py
@@ -115,6 +115,12 @@ def init_csc_option(self, csc_name):
         debug("init_csc_option(%s) module=%s", csc_name, csc_module)
         if csc_module is None:
             return
+        csc_type = csc_module.get_type()
+        try:
+            csc_module.init_module()
+        except Exception, e:
+            log.warn("cannot use %s module %s: %s", csc_type, csc_module, e, exc_info=True)
+            return
         in_cscs = csc_module.get_input_colorspaces()
         for in_csc in in_cscs:
             csc_specs = self._csc_encoder_specs.setdefault(in_csc, [])