Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Emit vector constant for vector index #921

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
5 changes: 2 additions & 3 deletions loopy/expression.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,9 +162,8 @@ def map_constant(self, expr: object) -> bool:

def map_variable(self, expr: p.Variable) -> bool:
    """Report whether the variable *expr* is a vector quantity.

    Returns ``True`` only when *expr* names the vectorizing iname
    (``self.vec_iname``); every other lone variable is a scalar.
    """
    # The vectorizing iname takes a different value in every vector lane,
    # so it is reported as a vector rather than rejected as unvectorizable.
    if expr.name == self.vec_iname:
        return True

    # A single variable is always a scalar.
    return False

Expand Down
82 changes: 81 additions & 1 deletion loopy/target/opencl.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@

from loopy.codegen import CodeGenerationState
from loopy.codegen.result import CodeGenerationResult
from loopy.kernel import LoopKernel


# {{{ dtype registry wrappers
Expand Down Expand Up @@ -456,7 +457,8 @@ def get_opencl_callables():

# {{{ symbol mangler

def opencl_symbol_mangler(kernel, name):
def opencl_symbol_mangler(kernel: LoopKernel,
name: str) -> tuple[NumpyType, str] | None:
# FIXME: should be more picky about exact names
if name.startswith("FLT_"):
return NumpyType(np.dtype(np.float32)), name
Expand Down Expand Up @@ -545,6 +547,21 @@ def wrap_in_typecast(self, actual_type, needed_dtype, s):
from pymbolic.primitives import Comparison
return Comparison(s, "!=", 0)

if needed_dtype == actual_type:
return s

registry = self.codegen_state.ast_builder.target.get_dtype_registry()
if self.codegen_state.target.is_vector_dtype(needed_dtype):
# OpenCL does not allow explicit casts between vector types. Instead, call
# the built-in conversion function convert_<desttype><n>(src), which returns
# a <desttype><n>; n is the number of vector elements and is the same as
# the number of elements in src.
# https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_C.html#explicit-casts
if self.codegen_state.target.is_vector_dtype(actual_type) or \
actual_type.dtype.kind == "b":
cast = var("convert_%s" % registry.dtype_to_ctype(needed_dtype))
return cast(s)

return super().wrap_in_typecast(actual_type, needed_dtype, s)

def map_group_hw_index(self, expr, type_context):
Expand All @@ -553,6 +570,69 @@ def map_group_hw_index(self, expr, type_context):
def map_local_hw_index(self, expr, type_context):
    """Build the call expression ``lid(<axis>)`` for a local hardware index."""
    lid = var("lid")
    return lid(expr.axis)

def map_variable(self, expr, type_context):
    """Map a variable, turning the vectorizing iname into a vector literal.

    While generating vectorized code, a reference to the vectorizing iname
    becomes a literal of the form ``((<vector type>) (0,1,...,length-1))``,
    i.e. one consecutive index per vector lane. Any other variable is
    handled by the parent class.
    """
    vec_info = self.codegen_state.vectorization_info
    if not vec_info or vec_info.iname != expr.name:
        return super().map_variable(expr, type_context)

    # This needs to be converted into a vector literal.
    from loopy.symbolic import Literal

    n_lanes = vec_info.length
    target = self.codegen_state.target
    lane_dtype = self.codegen_state.kernel.index_dtype
    typename = target.dtype_to_typename(target.vector_dtype(lane_dtype, n_lanes))

    lanes = ",".join(str(lane) for lane in range(n_lanes))
    return Literal(f"(({typename}) ({lanes}))")

def map_if(self, expr, type_context):
    """Map a conditional, choosing the type the condition is generated as.

    Outside of vectorized code, or when the vectorizability check does not
    classify the conditional as a vector expression, the condition is
    generated as a boolean. For a vector conditional it is generated as an
    integer vector whose element size is derived from the result type.
    Both branches are always generated with the inferred result type.
    """
    from loopy.types import to_loopy_type

    result_type = self.infer_type(expr)
    condition_type = to_loopy_type(np.bool_)

    vec_info = self.codegen_state.vectorization_info
    if vec_info:
        from loopy.codegen import UnvectorizableError
        from loopy.expression import VectorizabilityChecker

        checker = VectorizabilityChecker(
            self.codegen_state.kernel, vec_info.iname, vec_info.length)

        try:
            if checker(expr):
                # We could have a vector literal here which may need to be
                # converted to an appropriate size. The OpenCL specification
                # states that for (c ? a : b), a, b, and c must have the
                # same number of elements and bits, and that c must be an
                # integral type.
                # https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_C.html#table-builtin-relational
                lane_type = to_loopy_type(self.codegen_state.kernel.index_dtype)
                int_type_by_size = {
                    nbytes: to_loopy_type(scalar)
                    for nbytes, scalar in (
                        (8, np.int64), (4, np.int32),
                        (2, np.int16), (1, np.int8))}
                length = vec_info.length
                # Item sizes are measured in bytes.
                result_size = result_type.itemsize

                if (lane_type.itemsize != result_size
                        and result_size in int_type_by_size):
                    # Use the integer type as wide as the result type.
                    lane_type = int_type_by_size[result_size]
                elif (lane_type.itemsize * length != result_size
                        and result_size // length in int_type_by_size):
                    # Otherwise try the result size divided by the number
                    # of lanes as the per-lane integer width.
                    lane_type = int_type_by_size[result_size // length]

                condition_type = to_loopy_type(
                    self.codegen_state.target.vector_dtype(lane_type, length))
        except UnvectorizableError:
            # The checker rejected the expression: keep the boolean
            # condition type.
            pass

    condition = self.rec(expr.condition, type_context, condition_type)
    then_branch = self.rec(expr.then, type_context, result_type)
    else_branch = self.rec(expr.else_, type_context, result_type)
    return type(expr)(condition, then_branch, else_branch)
# }}}


Expand Down
30 changes: 30 additions & 0 deletions test/test_target.py
Original file line number Diff line number Diff line change
Expand Up @@ -875,6 +875,36 @@ def test_float3():
assert "float3" in device_code


def test_cl_vectorize_index_variable(ctx_factory):
    """Run a kernel whose conditional tests the index of a vectorized loop.

    The loop over ``i`` is split by 4 and the inner iname is tagged ``vec``,
    so the ``i < 32`` condition involves the vectorizing iname. The device
    result is compared against a NumPy reference.
    """
    knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        """
        b[i] = a[i]*3 if i < 32 else sin(a[i])
        """)

    # Split arrays and loop by 4, then tag the inner iname and the new
    # trailing array axis as the vector dimension.
    knl = lp.split_array_axis(knl, "a,b", 0, 4)
    knl = lp.split_iname(knl, "i", 4)
    knl = lp.tag_inames(knl, {"i_inner": "vec"})
    knl = lp.tag_array_axes(knl, "a,b", "c,vec")
    knl = lp.set_options(knl, write_code=True)
    knl = lp.assume(knl, "n % 4 = 0 and n>0")

    rng = np.random.default_rng(seed=12)
    a = rng.normal(size=(16, 4))

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)
    knl = lp.add_and_infer_dtypes(knl, {"a": np.float64, "n": np.int64})
    _evt, (result,) = knl(queue, a=a, n=a.size)

    # Entry (row, col) of the split array corresponds to flat index
    # row*4 + col of the original one-dimensional array.
    flat_index = np.arange(a.size).reshape(a.shape)
    result_ref = np.where(flat_index < 32, a * 3, np.sin(a))

    assert np.allclose(result, result_ref)


if __name__ == "__main__":
import sys
if len(sys.argv) > 1:
Expand Down
Loading