inducer · nkoskelo · Feb 20, 2025 · Feb 24, 2025 · Feb 24, 2025 · Feb 24, 2025
diff --git a/loopy/expression.py b/loopy/expression.py
@@ -162,9 +162,8 @@ def map_constant(self, expr: object) -> bool:
 
     def map_variable(self, expr: p.Variable) -> bool:
         if expr.name == self.vec_iname:
-            # Technically, this is doable. But we're not going there.
-            raise UnvectorizableError()
-
+            # The vectorization iname is itself a vector: one value per lane.
+            return True
         # A single variable is always a scalar.
         return False
 

diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
@@ -46,6 +46,7 @@
 
     from loopy.codegen import CodeGenerationState
     from loopy.codegen.result import CodeGenerationResult
+    from loopy.kernel import LoopKernel
 
 
 # {{{ dtype registry wrappers
@@ -456,7 +457,8 @@ def get_opencl_callables():
 
 # {{{ symbol mangler
 
-def opencl_symbol_mangler(kernel, name):
+def opencl_symbol_mangler(kernel: LoopKernel,
+                          name: str) -> tuple[NumpyType, str] | None:
     # FIXME: should be more picky about exact names
     if name.startswith("FLT_"):
         return NumpyType(np.dtype(np.float32)), name
@@ -545,6 +547,21 @@ def wrap_in_typecast(self, actual_type, needed_dtype, s):
             from pymbolic.primitives import Comparison
             return Comparison(s, "!=", 0)
 
+        if needed_dtype == actual_type:
+            return s
+
+        registry = self.codegen_state.ast_builder.target.get_dtype_registry()
+        if self.codegen_state.target.is_vector_dtype(needed_dtype):
+            # OpenCL C does not allow explicit casts between vector types.
+            # Instead, use the built-in conversion function, which has the form
+            # <desttype><n> convert_<desttype><n>(src), where n is the number
+            # of vector elements and must match the element count of src.
+            # https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_C.html#explicit-casts
+            if self.codegen_state.target.is_vector_dtype(actual_type) or \
+                actual_type.dtype.kind == "b":
+                cast = var("convert_%s" % registry.dtype_to_ctype(needed_dtype))
+                return cast(s)
+
         return super().wrap_in_typecast(actual_type, needed_dtype, s)
 
     def map_group_hw_index(self, expr, type_context):
@@ -553,6 +570,69 @@ def map_group_hw_index(self, expr, type_context):
     def map_local_hw_index(self, expr, type_context):
         return var("lid")(expr.axis)
 
+    def map_variable(self, expr, type_context):
+
+        if self.codegen_state.vectorization_info:
+            if self.codegen_state.vectorization_info.iname == expr.name:
+                # This needs to be converted into a vector literal.
+                from loopy.symbolic import Literal
+                vector_length = self.codegen_state.vectorization_info.length
+                index_type = self.codegen_state.kernel.index_dtype
+                vector_type = self.codegen_state.target.vector_dtype(index_type,
+                                                                     vector_length)
+                typecast = self.codegen_state.target.dtype_to_typename(vector_type)
+                vector_literal = f"(({typecast})" + " (" + \
+                        ",".join([f"{i}" for i in range(vector_length)]) + "))"
+                return Literal(vector_literal)
+        return super().map_variable(expr, type_context)
+
+    def map_if(self, expr, type_context):
+        from loopy.types import to_loopy_type
+        result_type = self.infer_type(expr)
+        conditional_needed_loopy_type = to_loopy_type(np.bool_)
+        if self.codegen_state.vectorization_info:
+            from loopy.codegen import UnvectorizableError
+            from loopy.expression import VectorizabilityChecker
+            checker = VectorizabilityChecker(self.codegen_state.kernel,
+                                     self.codegen_state.vectorization_info.iname,
+                                     self.codegen_state.vectorization_info.length)
+
+            try:
+                is_vector = checker(expr)
+
+                if is_vector:
+                    """
+                    The condition may contain a vector literal that needs to be
+                    converted to an appropriate size. The OpenCL specification
+                    states that for (c ? a : b), the operands a, b, and c must have
+                    the same number of elements and bits, and c must be integral.
+                    https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_C.html#table-builtin-relational
+                    """
+                    index_type = to_loopy_type(self.codegen_state.kernel.index_dtype)
+                    types = {8: to_loopy_type(np.int64), 4: to_loopy_type(np.int32),
+                             2: to_loopy_type(np.int16), 1: to_loopy_type(np.int8)}
+                    length = self.codegen_state.vectorization_info.length
+                    if index_type.itemsize != result_type.itemsize and \
+                        result_type.itemsize in types.keys():
+                        # Need to convert index type into result type size.
+                        # Item size is measured in bytes.
+                        index_type = types[result_type.itemsize]
+                    elif index_type.itemsize * length != result_type.itemsize and \
+                        (result_type.itemsize // length) in types.keys():
+
+                        index_type = types[result_type.itemsize // length]
+                    vector_type = self.codegen_state.target.vector_dtype(index_type,
+                                                                         length)
+                    conditional_needed_loopy_type = to_loopy_type(vector_type)
+            except UnvectorizableError:
+                pass
+
+        return type(expr)(
+                self.rec(expr.condition, type_context,
+                         conditional_needed_loopy_type),
+                self.rec(expr.then, type_context, result_type),
+                self.rec(expr.else_, type_context, result_type),
+                )
 # }}}
 
 

diff --git a/test/test_target.py b/test/test_target.py
@@ -875,6 +875,36 @@ def test_float3():
     assert "float3" in device_code
 
 
+def test_cl_vectorize_index_variable(ctx_factory):
+    knl = lp.make_kernel(
+            "{ [i]: 0<=i<n }",
+            """
+            b[i] = a[i]*3 if i < 32 else sin(a[i])
+            """)
+
+    knl = lp.split_array_axis(knl, "a,b", 0, 4)
+    knl = lp.split_iname(knl, "i", 4)
+    knl = lp.tag_inames(knl, {"i_inner": "vec"})
+    knl = lp.tag_array_axes(knl, "a,b", "c,vec")
+    knl = lp.set_options(knl, write_code=True)
+    knl = lp.assume(knl, "n % 4 = 0 and n>0")
+
+    rng = np.random.default_rng(seed=12)
+    a = rng.normal(size=(16, 4))
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+    knl = lp.add_and_infer_dtypes(knl, {"a": np.float64, "n": np.int64})
+    _evt, (result,) = knl(queue, a=a, n=a.size)
+
+    result_ref = np.zeros(a.shape, dtype=np.float64)
+    for i in range(16):
+        for j in range(4):
+            ind = i*4 + j
+            result_ref[i, j] = a[i, j] * 3 if ind < 32 else np.sin(a[i, j])
+
+    assert np.allclose(result, result_ref)
+
+
 if __name__ == "__main__":
     import sys
     if len(sys.argv) > 1: