From af7ff8abd2588e79116faeeb212249c673a779a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Kardos?= Date: Tue, 24 Oct 2023 10:53:42 +0200 Subject: [PATCH] feature[next] GPU backend from Python (#1325) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add support for gtfn cuda backend * reconcile new code with type hints by relaxing type hints. * add ADR and todos for library x builsys matrix design * add cpu/gpu tox variants for next, update ci files --------- Co-authored-by: Rico Häuselmann --- .github/workflows/test-next.yml | 10 +- ci/cscs-ci.yml | 1 + .../ADRs/0009-Compiled-Backend-Integration.md | 2 +- ...016-Multiple-Backends-and-Build-Systems.md | 118 ++++++++++++++++++ src/gt4py/next/iterator/embedded.py | 8 +- src/gt4py/next/otf/binding/nanobind.py | 18 +-- .../otf/compilation/build_systems/cmake.py | 19 +-- .../compilation/build_systems/cmake_lists.py | 35 ++++-- .../compilation/build_systems/compiledb.py | 69 +++++----- src/gt4py/next/otf/compilation/compiler.py | 2 +- src/gt4py/next/otf/languages.py | 10 +- src/gt4py/next/otf/recipes.py | 18 +-- src/gt4py/next/otf/step_types.py | 5 +- .../codegens/gtfn/codegen.py | 15 +++ .../codegens/gtfn/gtfn_module.py | 107 +++++++++++++--- .../otf_compile_executor.py | 8 +- .../runners/{gtfn_cpu.py => gtfn.py} | 52 +++++--- .../feature_tests/ffront_tests/__init__.py | 13 ++ .../ffront_tests/ffront_test_utils.py | 10 +- .../ffront_tests/test_arg_call_interface.py | 15 +-- .../ffront_tests/test_execution.py | 25 ++-- .../ffront_tests/test_gpu_backend.py | 43 +++++++ .../ffront_tests/test_gt4py_builtins.py | 11 +- .../ffront_tests/test_math_unary_builtins.py | 8 +- .../iterator_tests/test_builtins.py | 2 +- .../ffront_tests/test_icon_like_scan.py | 14 +-- .../iterator_tests/test_anton_toy.py | 8 +- .../iterator_tests/test_fvm_nabla.py | 3 +- .../iterator_tests/test_hdiff.py | 8 +- .../iterator_tests/test_vertical_advection.py | 16 ++- .../test_with_toy_connectivity.py | 10 +- .../otf_tests/test_gtfn_workflow.py | 4 +- tests/next_tests/unit_tests/conftest.py | 12 +- .../build_systems_tests/conftest.py | 2 +- .../gtfn_tests/test_gtfn_module.py | 4 +- tox.ini | 8 +- 36 files changed, 507 insertions(+), 206 deletions(-) create mode 100644 docs/development/ADRs/0016-Multiple-Backends-and-Build-Systems.md rename src/gt4py/next/program_processors/runners/{gtfn_cpu.py => gtfn.py} (76%) create mode 100644 tests/next_tests/integration_tests/feature_tests/ffront_tests/__init__.py create mode 100644 tests/next_tests/integration_tests/feature_tests/ffront_tests/test_gpu_backend.py diff --git a/.github/workflows/test-next.yml b/.github/workflows/test-next.yml index 5baeb6acef..52f8c25386 100644 --- a/.github/workflows/test-next.yml +++ b/.github/workflows/test-next.yml @@ -57,13 +57,13 @@ jobs: run: | pyversion=${{ matrix.python-version }} pyversion_no_dot=${pyversion//./} - tox run -e next-py${pyversion_no_dot}-${{ matrix.tox-env-factor }} - # mv coverage.json coverage-py${{ matrix.python-version }}-${{ matrix.os }}-${{ matrix.tox-env-factor }}.json + tox run -e next-py${pyversion_no_dot}-${{ matrix.tox-env-factor }}-cpu + # mv coverage.json coverage-py${{ matrix.python-version }}-${{ matrix.os }}-${{ matrix.tox-env-factor }}-cpu.json # - name: Upload coverage.json artifact # uses: actions/upload-artifact@v3 # with: - # name: coverage-py${{ matrix.python-version }}-${{ matrix.os }}-${{ matrix.tox-env-factor }} - # path: coverage-py${{ matrix.python-version }}-${{ matrix.os }}-${{ matrix.tox-env-factor 
}}.json + # name: coverage-py${{ matrix.python-version }}-${{ matrix.os }}-${{ matrix.tox-env-factor }}-cpu + # path: coverage-py${{ matrix.python-version }}-${{ matrix.os }}-${{ matrix.tox-env-factor }}-cpu.json # - name: Gather info # run: | # echo ${{ github.ref_type }} >> info.txt @@ -76,5 +76,5 @@ jobs: # - name: Upload info artifact # uses: actions/upload-artifact@v3 # with: - # name: info-py${{ matrix.python-version }}-${{ matrix.os }}-${{ matrix.tox-env-factor }} + # name: info-py${{ matrix.python-version }}-${{ matrix.os }}-${{ matrix.tox-env-factor }}-cpu # path: info.txt diff --git a/ci/cscs-ci.yml b/ci/cscs-ci.yml index 3dc38bcd97..971a3cfc35 100644 --- a/ci/cscs-ci.yml +++ b/ci/cscs-ci.yml @@ -117,3 +117,4 @@ test py310: - SUBPACKAGE: eve - SUBPACKAGE: next VARIANT: [-nomesh, -atlas] + SUBVARIANT: [-cuda11x, -cpu] diff --git a/docs/development/ADRs/0009-Compiled-Backend-Integration.md b/docs/development/ADRs/0009-Compiled-Backend-Integration.md index 273f954438..27c2f0c73c 100644 --- a/docs/development/ADRs/0009-Compiled-Backend-Integration.md +++ b/docs/development/ADRs/0009-Compiled-Backend-Integration.md @@ -159,7 +159,7 @@ Compiled backends may generate code which depends on libraries and tools written 1. can be installed with `pip` (from `PyPI` or another source) automatically. 2. can not be installed with `pip` and not commonly found on HPC machines. -3. libraries and tools which are left to the user to install and make discoverable: `pybind11`, C++ compilers +3. libraries and tools which are left to the user to install and make discoverable: `boost`, C++ compilers Category 1 are made dependencies of `GT4Py`. Examples include `pybind11`, `cmake`, `ninja`. diff --git a/docs/development/ADRs/0016-Multiple-Backends-and-Build-Systems.md b/docs/development/ADRs/0016-Multiple-Backends-and-Build-Systems.md new file mode 100644 index 0000000000..ac84903514 --- /dev/null +++ b/docs/development/ADRs/0016-Multiple-Backends-and-Build-Systems.md @@ -0,0 +1,118 @@ +--- +tags: [backend, gridtools, bindings, libraries, otf] +--- + +# Support for Multiple Backends, Build Systems and Libraries + +- **Status**: valid +- **Authors**: Rico Häuselmann (@DropD) +- **Created**: 2023-10-11 +- **Updated**: 2023-10-11 + +In the process of enabling CUDA for the GTFN backend, we encountered a potential support matrix of build systems x target language libraries. The current design requires build systems about all the libraries they can be used with. We decided that the matrix is too small for now and to not revisit the existing design yet. + +## Context + +ADRs [0009](0009-Compiled_Backend_Integration.md), [0011](0011-On_The_Fly_Compilation.md) and [0012](0012-GridTools_Cpp_OTF_Steps.md) detail the design decisions around what is loosely referred as "gt4py.next backends". In summary the goals are: + +- extensibility + - adding backends should not require changing existing code + - adding / modifying backend modules like build systems / compilers should not be blocked by assumptions in other modules. +- modularity + - increase the chance that two different backends (for example GTFN and another C++ backend) can share code. + +Therefore the concerns of generating code in the target language, generating python bindings in the target language and of building (compiling) the generated code are separated it code generator, bindings generator and compile step / build system. The compile step is written to be build system agnostic. 
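+
+As an illustration of this separation of concerns, the sketch below composes the four steps into a single workflow the way the GTFN runner does. It is a simplified sketch, not the authoritative wiring: the names follow `src/gt4py/next/program_processors/runners/gtfn.py` and `gt4py.next.otf`, and only the GPU variant is shown (the CPU variant differs only in `device_type`):
+
+```python
+# Sketch: the four separated concerns composed into one on-the-fly (OTF)
+# compile workflow, mirroring the GPU wiring in runners/gtfn.py.
+from gt4py._core import definitions as core_defs
+from gt4py.next.otf import recipes
+from gt4py.next.otf.binding import nanobind
+from gt4py.next.otf.compilation import cache, compiler
+from gt4py.next.otf.compilation.build_systems import compiledb
+from gt4py.next.program_processors.codegens.gtfn import gtfn_module
+from gt4py.next.program_processors.runners import gtfn
+
+gpu_workflow = recipes.OTFCompileWorkflow(
+    # ITIR program -> CUDA source (device_type selects the GridTools fn backend)
+    translation=gtfn_module.GTFNTranslationStep(device_type=core_defs.DeviceType.CUDA),
+    # program source -> program source + nanobind Python bindings
+    bindings=nanobind.bind_source,
+    # compilable source -> compiled, importable extension module
+    compilation=compiler.Compiler(
+        cache_strategy=cache.Strategy.SESSION,
+        builder_factory=compiledb.CompiledbFactory(),
+    ),
+    # argument conversion wrapped around the compiled program
+    decoration=gtfn.convert_args,
+)
+```
+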
+There is one category that connects all of these concerns: libraries written in the target language and used in the generated / bindings code.
+
+Current design:
+
+```mermaid
+graph LR
+
+gtgen("GTFN code generator (C++/Cuda)") --> |GridTools::fn_naive| Compiler
+gtgen("GTFN code generator (C++/Cuda)") --> |GridTools::fn_gpu| Compiler
+nb("nanobind bindings generator") --> |nanobind| Compiler
+Compiler --> CMakeProject --> CMakeListsGenerator
+Compiler --> CompiledbProject --> CMakeListsGenerator
+```
+
+The current design contains two mappings:
+
+- library name -> CMake `find_package()` call
+- library name -> CMake target name
+
+and the GridTools CPU/GPU link targets are differentiated by internally distinguishing between two fictitious libraries, "gridtools_cpu" and "gridtools_gpu". A simplified sketch of these two mappings is included at the end of this document.
+
+## Concerns
+
+### Usage
+
+The fictitious "gridtools_cpu" and "gridtools_gpu" library names add to the learning curve for this part of the code. Reusing the existing components might require this knowledge.
+
+### Scalability
+
+Adding a new backend that uses the existing build systems but relies on different libraries requires modifying existing build system components (at the very least the `CMakeListsGenerator`).
+
+### Separation of concerns
+
+It makes more sense to separate the concerns of how to generate a valid build system configuration and how to use a particular library in a particular build system than to mix the two.
+
+## Decision
+
+Currently the code overhead is in the tens of lines, and there are no concrete plans to add more compiled backends or different build systems. Therefore we decide to keep the current design for now but to redesign as soon as the matrix grows.
+To this end, TODO comments are added in the relevant places.
+
+## Consequences
+
+Initial GTFN GPU support will not be blocked by design work.
+
+## Alternatives Considered
+
+### Push build system support to the LibraryDependency instance
+
+```
+# src/gt4py/next/otf/binding/interface.py
+
+...
+class LibraryDependency:
+    name: str
+    version: str
+    link_targets: list[str]
+    include_headers: list[str]
+```
+
+- Simple; the choice is made at the code generator level, where the knowledge should be
+- The interface might not suit every build system
+- Up to the implementer to make the logic for choosing reusable (or not)
+
+### Create additional data structures to properly separate concerns
+
+```
+class BuildSystemConfig:
+    device_type: core_defs.DeviceType
+    ...
+
+
+class LibraryAdaptor:
+    library: LibraryDependency
+    build_system: CMakeProject
+
+    def config_phase(self, config: BuildSystemConfig) -> str:
+        import gridtools_cpp
+        cmake_dir = gridtools_cpp.get_cmake_dir()
+
+        return f"find_package(... {cmake_dir} ... )"
+
+    def build_phase(self, config: BuildSystemConfig) -> str:
+        return ""  # header-only library
+
+    def link_phase(self, main_target_name: str, config: BuildSystemConfig) -> str:
+        return f"target_link_libraries({main_target_name} ...)"
+```
+
+- More general and fully extensible: adaptors can be added for any required library / build system combination without touching existing code (depending on the registering mechanism).
+- More likely to be reusable, as choices are explicit and can be overridden separately by subclassing.
+- More design work required. Open questions:
+  - Design the interface to work with any build system
+  - How to register adaptors? Entry points? A global dictionary?
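+
+For reference, the two mappings of the current design reduce to roughly the following sketch (simplified from `CMakeListsGenerator` in `cmake_lists.py`; the nanobind dependency and the surrounding CMake code generation are omitted):
+
+```python
+# Simplified sketch of the current design's two mappings; assumes the
+# gridtools-cpp package is installed (it ships the GridTools CMake config).
+import gridtools_cpp
+
+
+def find_package_call(library_name: str) -> str:
+    # library name -> CMake `find_package()` call
+    if library_name in ("gridtools_cpu", "gridtools_gpu"):
+        cmake_dir = gridtools_cpp.get_cmake_dir()
+        return f"find_package(GridTools REQUIRED PATHS {cmake_dir} NO_DEFAULT_PATH)"
+    raise ValueError(f"Library {library_name} is not supported")
+
+
+def link_target(library_name: str) -> str:
+    # library name -> CMake target name; the fictitious cpu/gpu split only
+    # exists to select the matching GridTools `fn` backend target.
+    targets = {
+        "gridtools_cpu": "GridTools::fn_naive",
+        "gridtools_gpu": "GridTools::fn_gpu",
+    }
+    if library_name not in targets:
+        raise ValueError(f"Library {library_name} is not supported")
+    return targets[library_name]
+```
+
+Both mappings live inside the build system code, which is exactly the coupling the adaptor alternative above would remove.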
diff --git a/src/gt4py/next/iterator/embedded.py b/src/gt4py/next/iterator/embedded.py index 0edea35cf5..3d159eaae7 100644 --- a/src/gt4py/next/iterator/embedded.py +++ b/src/gt4py/next/iterator/embedded.py @@ -685,7 +685,7 @@ def _single_vertical_idx( indices: NamedFieldIndices, column_axis: Tag, column_index: common.IntIndex ) -> NamedFieldIndices: transformed = { - axis: (index if axis != column_axis else index.start + column_index) # type: ignore[union-attr] # trust me, `index` is range in case of `column_axis` + axis: (index if axis != column_axis else index.start + column_index) # type: ignore[union-attr] # trust me, `index` is range in case of `column_axis` # fmt: off for axis, index in indices.items() } return transformed @@ -1050,7 +1050,7 @@ def __gt_origin__(self) -> tuple[int, ...]: return (0,) @classmethod - def __gt_builtin_func__(func: Callable, /) -> NoReturn: # type: ignore[override] # Signature incompatible with supertype + def __gt_builtin_func__(func: Callable, /) -> NoReturn: # type: ignore[override] # Signature incompatible with supertype # fmt: off raise NotImplementedError() @property @@ -1070,7 +1070,7 @@ def remap(self, index_field: common.Field) -> common.Field: raise NotImplementedError() def restrict(self, item: common.AnyIndexSpec) -> common.Field | core_defs.int32: - if common.is_absolute_index_sequence(item) and all(common.is_named_index(e) for e in item): # type: ignore[arg-type] # we don't want to pollute the typing of `is_absolute_index_sequence` for this temporary code + if common.is_absolute_index_sequence(item) and all(common.is_named_index(e) for e in item): # type: ignore[arg-type] # we don't want to pollute the typing of `is_absolute_index_sequence` for this temporary code # fmt: off d, r = item[0] assert d == self._dimension assert isinstance(r, int) @@ -1156,7 +1156,7 @@ def __gt_origin__(self) -> tuple[int, ...]: return tuple() @classmethod - def __gt_builtin_func__(func: Callable, /) -> NoReturn: # type: ignore[override] # Signature incompatible with supertype + def __gt_builtin_func__(func: Callable, /) -> NoReturn: # type: ignore[override] # Signature incompatible with supertype # fmt: off raise NotImplementedError() @property diff --git a/src/gt4py/next/otf/binding/nanobind.py b/src/gt4py/next/otf/binding/nanobind.py index 9dccddc012..5d54512bd0 100644 --- a/src/gt4py/next/otf/binding/nanobind.py +++ b/src/gt4py/next/otf/binding/nanobind.py @@ -17,7 +17,7 @@ from __future__ import annotations -from typing import Any, Sequence, Union +from typing import Any, Sequence, TypeVar, Union import gt4py.eve as eve from gt4py.eve.codegen import JinjaTemplate as as_jinja, TemplatedGenerator @@ -26,6 +26,9 @@ from gt4py.next.type_system import type_info as ti, type_specifications as ts +SrcL = TypeVar("SrcL", bound=languages.NanobindSrcL, covariant=True) + + class Expr(eve.Node): pass @@ -191,8 +194,8 @@ def make_argument(name: str, type_: ts.TypeSpec) -> str | BufferSID | CompositeS def create_bindings( - program_source: stages.ProgramSource[languages.Cpp, languages.LanguageWithHeaderFilesSettings], -) -> stages.BindingSource[languages.Cpp, languages.Python]: + program_source: stages.ProgramSource[SrcL, languages.LanguageWithHeaderFilesSettings], +) -> stages.BindingSource[SrcL, languages.Python]: """ Generate Python bindings through which a C++ function can be called. 
@@ -201,7 +204,7 @@ def create_bindings( program_source The program source for which the bindings are created """ - if program_source.language is not languages.Cpp: + if program_source.language not in [languages.Cpp, languages.Cuda]: raise ValueError( f"Can only create bindings for C++ program sources, received {program_source.language}." ) @@ -221,7 +224,6 @@ def create_bindings( "gridtools/common/tuple_util.hpp", "gridtools/fn/unstructured.hpp", "gridtools/fn/cartesian.hpp", - "gridtools/fn/backend/naive.hpp", "gridtools/storage/adapter/nanobind_adapter.hpp", ], wrapper=WrapperFunction( @@ -266,8 +268,6 @@ def create_bindings( @workflow.make_step def bind_source( - inp: stages.ProgramSource[languages.Cpp, languages.LanguageWithHeaderFilesSettings], -) -> stages.CompilableSource[ - languages.Cpp, languages.LanguageWithHeaderFilesSettings, languages.Python -]: + inp: stages.ProgramSource[SrcL, languages.LanguageWithHeaderFilesSettings], +) -> stages.CompilableSource[SrcL, languages.LanguageWithHeaderFilesSettings, languages.Python]: return stages.CompilableSource(program_source=inp, binding_source=create_bindings(inp)) diff --git a/src/gt4py/next/otf/compilation/build_systems/cmake.py b/src/gt4py/next/otf/compilation/build_systems/cmake.py index b281fde7b5..3d36f5d985 100644 --- a/src/gt4py/next/otf/compilation/build_systems/cmake.py +++ b/src/gt4py/next/otf/compilation/build_systems/cmake.py @@ -38,7 +38,7 @@ def _generate_next_value_(name, start, count, last_values): @dataclasses.dataclass class CMakeFactory( compiler.BuildSystemProjectGenerator[ - languages.Cpp, languages.LanguageWithHeaderFilesSettings, languages.Python + languages.Cpp | languages.Cuda, languages.LanguageWithHeaderFilesSettings, languages.Python ] ): """Create a CMakeProject from a ``CompilableSource`` stage object with given CMake settings.""" @@ -50,7 +50,7 @@ class CMakeFactory( def __call__( self, source: stages.CompilableSource[ - languages.Cpp, + languages.Cpp | languages.Cuda, languages.LanguageWithHeaderFilesSettings, languages.Python, ], @@ -63,16 +63,21 @@ def __call__( name = source.program_source.entry_point.name header_name = f"{name}.{source.program_source.language_settings.header_extension}" bindings_name = f"{name}_bindings.{source.program_source.language_settings.file_extension}" + cmake_languages = [cmake_lists.Language(name="CXX")] + if source.program_source.language is languages.Cuda: + cmake_languages = [*cmake_languages, cmake_lists.Language(name="CUDA")] + cmake_lists_src = cmake_lists.generate_cmakelists_source( + name, + source.library_deps, + [header_name, bindings_name], + languages=cmake_languages, + ) return CMakeProject( root_path=cache.get_cache_folder(source, cache_strategy), source_files={ header_name: source.program_source.source_code, bindings_name: source.binding_source.source_code, - "CMakeLists.txt": cmake_lists.generate_cmakelists_source( - name, - source.library_deps, - [header_name, bindings_name], - ), + "CMakeLists.txt": cmake_lists_src, }, program_name=name, generator_name=self.cmake_generator_name, diff --git a/src/gt4py/next/otf/compilation/build_systems/cmake_lists.py b/src/gt4py/next/otf/compilation/build_systems/cmake_lists.py index ef222341e3..5ea4ba0519 100644 --- a/src/gt4py/next/otf/compilation/build_systems/cmake_lists.py +++ b/src/gt4py/next/otf/compilation/build_systems/cmake_lists.py @@ -30,22 +30,31 @@ class LinkDependency(eve.Node): target: str +class Language(eve.Node): + name: str + + class CMakeListsFile(eve.Node): project_name: str find_deps: 
Sequence[FindDependency] link_deps: Sequence[LinkDependency] source_names: Sequence[str] bin_output_suffix: str + languages: Sequence[Language] class CMakeListsGenerator(eve.codegen.TemplatedGenerator): CMakeListsFile = as_jinja( """ - project({{project_name}}) cmake_minimum_required(VERSION 3.20.0) + project({{project_name}}) + # Languages - enable_language(CXX) + if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES 60) + endif() + {{"\\n".join(languages)}} # Paths list(APPEND CMAKE_MODULE_PATH ${CMAKE_BINARY_DIR}) @@ -77,18 +86,17 @@ class CMakeListsGenerator(eve.codegen.TemplatedGenerator): ) def visit_FindDependency(self, dep: FindDependency): + # TODO(ricoh): do not add more libraries here + # and do not use this design in a new build system. + # Instead, design this to be extensible (refer to ADR-0016). match dep.name: - case "pybind11": - import pybind11 - - return f"find_package(pybind11 CONFIG REQUIRED PATHS {pybind11.get_cmake_dir()} NO_DEFAULT_PATH)" case "nanobind": import nanobind py = "find_package(Python COMPONENTS Interpreter Development REQUIRED)" nb = f"find_package(nanobind CONFIG REQUIRED PATHS {nanobind.cmake_dir()} NO_DEFAULT_PATHS)" return py + "\n" + nb - case "gridtools": + case "gridtools_cpu" | "gridtools_gpu": import gridtools_cpp return f"find_package(GridTools REQUIRED PATHS {gridtools_cpp.get_cmake_dir()} NO_DEFAULT_PATH)" @@ -96,13 +104,16 @@ def visit_FindDependency(self, dep: FindDependency): raise ValueError("Library {name} is not supported".format(name=dep.name)) def visit_LinkDependency(self, dep: LinkDependency): + # TODO(ricoh): do not add more libraries here + # and do not use this design in a new build system. + # Instead, design this to be extensible (refer to ADR-0016). match dep.name: - case "pybind11": - lib_name = "pybind11::module" case "nanobind": lib_name = "nanobind-static" - case "gridtools": + case "gridtools_cpu": lib_name = "GridTools::fn_naive" + case "gridtools_gpu": + lib_name = "GridTools::fn_gpu" case _: raise ValueError("Library {name} is not supported".format(name=dep.name)) @@ -118,11 +129,14 @@ def visit_LinkDependency(self, dep: LinkDependency): lnk = f"target_link_libraries({dep.target} PUBLIC {lib_name})" return cfg + "\n" + lnk + Language = as_jinja("enable_language({{name}})") + def generate_cmakelists_source( project_name: str, dependencies: tuple[interface.LibraryDependency, ...], source_names: Sequence[str], + languages: Sequence[Language] = (Language(name="CXX"),), ) -> str: """ Generate CMakeLists file contents. 
@@ -135,5 +149,6 @@ def generate_cmakelists_source( link_deps=[LinkDependency(name=d.name, target=project_name) for d in dependencies], source_names=source_names, bin_output_suffix=common.python_module_suffix(), + languages=languages, ) return CMakeListsGenerator.apply(cmakelists_file) diff --git a/src/gt4py/next/otf/compilation/build_systems/compiledb.py b/src/gt4py/next/otf/compilation/build_systems/compiledb.py index 34f2f85081..84a69859c0 100644 --- a/src/gt4py/next/otf/compilation/build_systems/compiledb.py +++ b/src/gt4py/next/otf/compilation/build_systems/compiledb.py @@ -20,7 +20,7 @@ import re import shutil import subprocess -from typing import Optional +from typing import Optional, TypeVar from gt4py.next.otf import languages, stages from gt4py.next.otf.binding import interface @@ -28,10 +28,13 @@ from gt4py.next.otf.compilation.build_systems import cmake, cmake_lists +SrcL = TypeVar("SrcL", bound=languages.NanobindSrcL) + + @dataclasses.dataclass class CompiledbFactory( compiler.BuildSystemProjectGenerator[ - languages.Cpp, languages.LanguageWithHeaderFilesSettings, languages.Python + SrcL, languages.LanguageWithHeaderFilesSettings, languages.Python ] ): """ @@ -48,7 +51,7 @@ class CompiledbFactory( def __call__( self, source: stages.CompilableSource[ - languages.Cpp, + SrcL, languages.LanguageWithHeaderFilesSettings, languages.Python, ], @@ -66,6 +69,8 @@ def __call__( deps=source.library_deps, build_type=self.cmake_build_type, cmake_flags=self.cmake_extra_flags or [], + language=source.program_source.language, + language_settings=source.program_source.language_settings, ) if self.renew_compiledb or not ( @@ -92,9 +97,7 @@ def __call__( @dataclasses.dataclass() class CompiledbProject( - stages.BuildSystemProject[ - languages.Cpp, languages.LanguageWithHeaderFilesSettings, languages.Python - ] + stages.BuildSystemProject[SrcL, languages.LanguageWithHeaderFilesSettings, languages.Python] ): """ Compiledb build system for gt4py programs. 
@@ -113,18 +116,21 @@ class CompiledbProject( compile_commands_cache: pathlib.Path bindings_file_name: str - def build(self): + def build(self) -> None: self._write_files() - if build_data.read_data(self.root_path).status < build_data.BuildStatus.CONFIGURED: + current_data = build_data.read_data(self.root_path) + if current_data is None or current_data.status < build_data.BuildStatus.CONFIGURED: self._run_config() + current_data = build_data.read_data(self.root_path) # update after config if ( - build_data.BuildStatus.CONFIGURED - <= build_data.read_data(self.root_path).status + current_data is not None + and build_data.BuildStatus.CONFIGURED + <= current_data.status < build_data.BuildStatus.COMPILED ): self._run_build() - def _write_files(self): + def _write_files(self) -> None: def ignore_not_libraries(folder: str, children: list[str]) -> list[str]: pattern = r"((lib.*\.a)|(.*\.lib))" libraries = [child for child in children if re.match(pattern, child)] @@ -151,7 +157,7 @@ def ignore_not_libraries(folder: str, children: list[str]) -> list[str]: path=self.root_path, ) - def _run_config(self): + def _run_config(self) -> None: compile_db = json.loads(self.compile_commands_cache.read_text()) (self.root_path / "build").mkdir(exist_ok=True) @@ -176,7 +182,7 @@ def _run_config(self): self.root_path, ) - def _run_build(self): + def _run_build(self) -> None: logfile = self.root_path / "log_build.txt" compile_db = json.loads((self.root_path / "compile_commands.json").read_text()) assert compile_db @@ -212,19 +218,16 @@ def _cc_prototype_program_source( deps: tuple[interface.LibraryDependency, ...], build_type: cmake.BuildType, cmake_flags: list[str], + language: type[SrcL], + language_settings: languages.LanguageWithHeaderFilesSettings, ) -> stages.ProgramSource: name = _cc_prototype_program_name(deps, build_type.value, cmake_flags) return stages.ProgramSource( entry_point=interface.Function(name=name, parameters=()), source_code="", library_deps=deps, - language=languages.Cpp, - language_settings=languages.LanguageWithHeaderFilesSettings( - formatter_key="", - formatter_style=None, - file_extension="", - header_extension="", - ), + language=language, + language_settings=language_settings, ) @@ -251,16 +254,26 @@ def _cc_create_compiledb( stages.CompilableSource(prototype_program_source, None), cache_strategy ) + header_ext = prototype_program_source.language_settings.header_extension + src_ext = prototype_program_source.language_settings.file_extension + prog_src_name = f"{name}.{header_ext}" + binding_src_name = f"{name}.{src_ext}" + cmake_languages = [cmake_lists.Language(name="CXX")] + if prototype_program_source.language is languages.Cuda: + cmake_languages = [*cmake_languages, cmake_lists.Language(name="CUDA")] + prototype_project = cmake.CMakeProject( generator_name="Ninja", build_type=build_type, extra_cmake_flags=cmake_flags, root_path=cache_path, source_files={ - f"{name}.hpp": "", - f"{name}.cpp": "", + **{name: "" for name in [binding_src_name, prog_src_name]}, "CMakeLists.txt": cmake_lists.generate_cmakelists_source( - name, prototype_program_source.library_deps, [f"{name}.hpp", f"{name}.cpp"] + name, + prototype_program_source.library_deps, + [binding_src_name, prog_src_name], + cmake_languages, ), }, program_name=name, @@ -290,21 +303,21 @@ def _cc_create_compiledb( entry["command"] .replace(f"CMakeFiles/{name}.dir", ".") .replace(str(cache_path), "$SRC_PATH") - .replace(f"{name}.cpp", "$BINDINGS_FILE") - .replace(f"{name}", "$NAME") + .replace(binding_src_name, "$BINDINGS_FILE") + 
.replace(name, "$NAME") .replace("-I$SRC_PATH/build/_deps", f"-I{cache_path}/build/_deps") ) entry["file"] = ( entry["file"] .replace(f"CMakeFiles/{name}.dir", ".") .replace(str(cache_path), "$SRC_PATH") - .replace(f"{name}.cpp", "$BINDINGS_FILE") + .replace(binding_src_name, "$BINDINGS_FILE") ) entry["output"] = ( entry["output"] .replace(f"CMakeFiles/{name}.dir", ".") - .replace(f"{name}.cpp", "$BINDINGS_FILE") - .replace(f"{name}", "$NAME") + .replace(binding_src_name, "$BINDINGS_FILE") + .replace(name, "$NAME") ) compile_db_path = cache_path / "compile_commands.json" diff --git a/src/gt4py/next/otf/compilation/compiler.py b/src/gt4py/next/otf/compilation/compiler.py index 32c5469333..dacb444207 100644 --- a/src/gt4py/next/otf/compilation/compiler.py +++ b/src/gt4py/next/otf/compilation/compiler.py @@ -23,7 +23,7 @@ from gt4py.next.otf.step_types import LS, SrcL, TgtL -SourceLanguageType = TypeVar("SourceLanguageType", bound=languages.LanguageTag) +SourceLanguageType = TypeVar("SourceLanguageType", bound=languages.NanobindSrcL) LanguageSettingsType = TypeVar("LanguageSettingsType", bound=languages.LanguageSettings) T = TypeVar("T") diff --git a/src/gt4py/next/otf/languages.py b/src/gt4py/next/otf/languages.py index e2738615ac..b0d01d91ab 100644 --- a/src/gt4py/next/otf/languages.py +++ b/src/gt4py/next/otf/languages.py @@ -57,6 +57,14 @@ class Python(LanguageTag): ... -class Cpp(LanguageTag): +class NanobindSrcL(LanguageTag): + ... + + +class Cpp(NanobindSrcL): settings_class = LanguageWithHeaderFilesSettings ... + + +class Cuda(NanobindSrcL): + settings_class = LanguageWithHeaderFilesSettings diff --git a/src/gt4py/next/otf/recipes.py b/src/gt4py/next/otf/recipes.py index d144533798..4c6cdc273d 100644 --- a/src/gt4py/next/otf/recipes.py +++ b/src/gt4py/next/otf/recipes.py @@ -14,27 +14,21 @@ from __future__ import annotations import dataclasses -from typing import Generic, TypeVar -from gt4py.next.otf import languages, stages, step_types, workflow - - -SrcL = TypeVar("SrcL", bound=languages.LanguageTag) -TgtL = TypeVar("TgtL", bound=languages.LanguageTag) -LS = TypeVar("LS", bound=languages.LanguageSettings) +from gt4py.next.otf import stages, step_types, workflow @dataclasses.dataclass(frozen=True) -class OTFCompileWorkflow(workflow.NamedStepSequence, Generic[SrcL, LS, TgtL]): +class OTFCompileWorkflow(workflow.NamedStepSequence): """The typical compiled backend steps composed into a workflow.""" - translation: step_types.TranslationStep[SrcL, LS] + translation: step_types.TranslationStep bindings: workflow.Workflow[ - stages.ProgramSource[SrcL, LS], - stages.CompilableSource[SrcL, LS, TgtL], + stages.ProgramSource, + stages.CompilableSource, ] compilation: workflow.Workflow[ - stages.CompilableSource[SrcL, LS, TgtL], + stages.CompilableSource, stages.CompiledProgram, ] decoration: workflow.Workflow[stages.CompiledProgram, stages.CompiledProgram] diff --git a/src/gt4py/next/otf/step_types.py b/src/gt4py/next/otf/step_types.py index 54fe2e5389..5eeb5c495b 100644 --- a/src/gt4py/next/otf/step_types.py +++ b/src/gt4py/next/otf/step_types.py @@ -50,7 +50,10 @@ def __call__( ... 
-class CompilationStep(Protocol[SrcL, LS, TgtL]): +class CompilationStep( + workflow.Workflow[stages.CompilableSource[SrcL, LS, TgtL], stages.CompiledProgram], + Protocol[SrcL, LS, TgtL], +): """Compile program source code and bindings into a python callable (CompilableSource -> CompiledProgram).""" def __call__(self, source: stages.CompilableSource[SrcL, LS, TgtL]) -> stages.CompiledProgram: diff --git a/src/gt4py/next/program_processors/codegens/gtfn/codegen.py b/src/gt4py/next/program_processors/codegens/gtfn/codegen.py index 8cd910e40f..645d1f742f 100644 --- a/src/gt4py/next/program_processors/codegens/gtfn/codegen.py +++ b/src/gt4py/next/program_processors/codegens/gtfn/codegen.py @@ -220,6 +220,7 @@ def visit_FencilDefinition( return self.generic_visit( node, grid_type_str=self._grid_type_str[node.grid_type], + block_sizes=self._block_sizes(node.offset_definitions), **kwargs, ) @@ -261,6 +262,8 @@ def visit_TemporaryAllocation(self, node, **kwargs): ${'\\n'.join(offset_definitions)} ${'\\n'.join(function_definitions)} + ${block_sizes} + inline auto ${id} = [](auto... connectivities__){ return [connectivities__...](auto backend, ${','.join('auto&& ' + p for p in params)}){ auto tmp_alloc__ = gtfn::backend::tmp_allocator(backend); @@ -273,6 +276,18 @@ def visit_TemporaryAllocation(self, node, **kwargs): """ ) + def _block_sizes(self, offset_definitions: list[gtfn_ir.TagDefinition]) -> str: + block_dims = [] + block_sizes = [32, 8] + [1] * (len(offset_definitions) - 2) + for i, tag in enumerate(offset_definitions): + if tag.alias is None: + block_dims.append( + f"gridtools::meta::list<{tag.name.id}_t, " + f"gridtools::integral_constant>" + ) + sizes_str = ",\n".join(block_dims) + return f"using block_sizes_t = gridtools::meta::list<{sizes_str}>;" + @classmethod def apply(cls, root: Any, **kwargs: Any) -> str: generated_code = super().apply(root, **kwargs) diff --git a/src/gt4py/next/program_processors/codegens/gtfn/gtfn_module.py b/src/gt4py/next/program_processors/codegens/gtfn/gtfn_module.py index 5e24e855b5..7bf310f4e1 100644 --- a/src/gt4py/next/program_processors/codegens/gtfn/gtfn_module.py +++ b/src/gt4py/next/program_processors/codegens/gtfn/gtfn_module.py @@ -16,10 +16,11 @@ import dataclasses import warnings -from typing import Any, Final, Optional, TypeVar +from typing import Any, Final, Optional import numpy as np +from gt4py._core import definitions as core_defs from gt4py.eve import trees, utils from gt4py.next import common from gt4py.next.common import Connectivity, Dimension @@ -32,8 +33,6 @@ from gt4py.next.type_system import type_specifications as ts, type_translation -T = TypeVar("T") - GENERATED_CONNECTIVITY_PARAM_PREFIX = "gt_conn_" @@ -45,14 +44,30 @@ def get_param_description(name: str, obj: Any) -> interface.Parameter: class GTFNTranslationStep( workflow.ChainableWorkflowMixin[ stages.ProgramCall, - stages.ProgramSource[languages.Cpp, languages.LanguageWithHeaderFilesSettings], + stages.ProgramSource[languages.NanobindSrcL, languages.LanguageWithHeaderFilesSettings], ], - step_types.TranslationStep[languages.Cpp, languages.LanguageWithHeaderFilesSettings], + step_types.TranslationStep[languages.NanobindSrcL, languages.LanguageWithHeaderFilesSettings], ): - language_settings: languages.LanguageWithHeaderFilesSettings = cpp_interface.CPP_DEFAULT - enable_itir_transforms: bool = True # TODO replace by more general mechanism, see https://github.com/GridTools/gt4py/issues/1135 + language_settings: Optional[languages.LanguageWithHeaderFilesSettings] = None + # 
TODO replace by more general mechanism, see https://github.com/GridTools/gt4py/issues/1135 + enable_itir_transforms: bool = True use_imperative_backend: bool = False lift_mode: Optional[LiftMode] = None + device_type: core_defs.DeviceType = core_defs.DeviceType.CPU + + def _default_language_settings(self) -> languages.LanguageWithHeaderFilesSettings: + match self.device_type: + case core_defs.DeviceType.CUDA: + return languages.LanguageWithHeaderFilesSettings( + formatter_key=cpp_interface.CPP_DEFAULT.formatter_key, + formatter_style=cpp_interface.CPP_DEFAULT.formatter_style, + file_extension="cu", + header_extension="cuh", + ) + case core_defs.DeviceType.CPU: + return cpp_interface.CPP_DEFAULT + case _: + raise self._not_implemented_for_device_type() def _process_regular_arguments( self, @@ -98,7 +113,7 @@ def _process_regular_arguments( isinstance( dim, fbuiltins.FieldOffset ) # TODO(havogt): remove support for FieldOffset as Dimension - or dim.kind == common.DimensionKind.LOCAL + or dim.kind is common.DimensionKind.LOCAL ): # translate sparse dimensions to tuple dtype dim_name = dim.value @@ -159,7 +174,7 @@ def _process_connectivity_args( def __call__( self, inp: stages.ProgramCall, - ) -> stages.ProgramSource[languages.Cpp, languages.LanguageWithHeaderFilesSettings]: + ) -> stages.ProgramSource[languages.NanobindSrcL, languages.LanguageWithHeaderFilesSettings]: """Generate GTFN C++ code from the ITIR definition.""" program: itir.FencilDefinition = inp.program @@ -189,7 +204,8 @@ def __call__( # combine into a format that is aligned with what the backend expects parameters: list[interface.Parameter] = regular_parameters + connectivity_parameters - args_expr: list[str] = ["gridtools::fn::backend::naive{}", *regular_args_expr] + backend_arg = self._backend_type() + args_expr: list[str] = [backend_arg, *regular_args_expr] function = interface.Function(program.id, tuple(parameters)) decl_body = ( @@ -205,9 +221,9 @@ def __call__( **inp.kwargs, ) source_code = interface.format_source( - self.language_settings, + self._language_settings(), f""" - #include + #include <{self._backend_header()}> #include #include {stencil_src} @@ -215,16 +231,69 @@ def __call__( """.strip(), ) - module = stages.ProgramSource( + module: stages.ProgramSource[ + languages.NanobindSrcL, languages.LanguageWithHeaderFilesSettings + ] = stages.ProgramSource( entry_point=function, - library_deps=(interface.LibraryDependency("gridtools", "master"),), + library_deps=(interface.LibraryDependency(self._library_name(), "master"),), source_code=source_code, - language=languages.Cpp, - language_settings=self.language_settings, + language=self._language(), + language_settings=self._language_settings(), ) return module + def _backend_header(self) -> str: + match self.device_type: + case core_defs.DeviceType.CUDA: + return "gridtools/fn/backend/gpu.hpp" + case core_defs.DeviceType.CPU: + return "gridtools/fn/backend/naive.hpp" + case _: + raise self._not_implemented_for_device_type() + + def _backend_type(self) -> str: + match self.device_type: + case core_defs.DeviceType.CUDA: + return "gridtools::fn::backend::gpu{}" + case core_defs.DeviceType.CPU: + return "gridtools::fn::backend::naive{}" + case _: + raise self._not_implemented_for_device_type() + + def _language(self) -> type[languages.NanobindSrcL]: + match self.device_type: + case core_defs.DeviceType.CUDA: + return languages.Cuda + case core_defs.DeviceType.CPU: + return languages.Cpp + case _: + raise self._not_implemented_for_device_type() + + def 
_language_settings(self) -> languages.LanguageWithHeaderFilesSettings: + return ( + self.language_settings + if self.language_settings is not None + else self._default_language_settings() + ) + + def _library_name(self) -> str: + match self.device_type: + case core_defs.DeviceType.CUDA: + return "gridtools_gpu" + case core_defs.DeviceType.CPU: + return "gridtools_cpu" + case _: + raise self._not_implemented_for_device_type() + + def _not_implemented_for_device_type(self) -> NotImplementedError: + return NotImplementedError( + f"{self.__class__.__name__} is not implemented for " + f"device type {self.device_type.name}" + ) + + +translate_program_cpu: Final[step_types.TranslationStep] = GTFNTranslationStep() -translate_program: Final[ - step_types.TranslationStep[languages.Cpp, languages.LanguageWithHeaderFilesSettings] -] = GTFNTranslationStep() +translate_program_gpu: Final[step_types.TranslationStep] = GTFNTranslationStep( + device_type=core_defs.DeviceType.CUDA +) diff --git a/src/gt4py/next/program_processors/otf_compile_executor.py b/src/gt4py/next/program_processors/otf_compile_executor.py index a22028414b..cd08c16933 100644 --- a/src/gt4py/next/program_processors/otf_compile_executor.py +++ b/src/gt4py/next/program_processors/otf_compile_executor.py @@ -20,15 +20,15 @@ from gt4py.next.program_processors import processor_interface as ppi -SrcL = TypeVar("SrcL", bound=languages.LanguageTag) +SrcL = TypeVar("SrcL", bound=languages.NanobindSrcL) TgtL = TypeVar("TgtL", bound=languages.LanguageTag) LS = TypeVar("LS", bound=languages.LanguageSettings) HashT = TypeVar("HashT") @dataclasses.dataclass(frozen=True) -class OTFCompileExecutor(ppi.ProgramExecutor, Generic[SrcL, LS, TgtL, HashT]): - otf_workflow: recipes.OTFCompileWorkflow[SrcL, LS, TgtL] +class OTFCompileExecutor(ppi.ProgramExecutor): + otf_workflow: recipes.OTFCompileWorkflow name: Optional[str] = None def __call__(self, program: itir.FencilDefinition, *args, **kwargs: Any) -> None: @@ -42,7 +42,7 @@ def __name__(self) -> str: @dataclasses.dataclass(frozen=True) -class CachedOTFCompileExecutor(ppi.ProgramExecutor, Generic[SrcL, LS, TgtL, HashT]): +class CachedOTFCompileExecutor(ppi.ProgramExecutor, Generic[HashT]): otf_workflow: workflow.CachedStep[stages.ProgramCall, stages.CompiledProgram, HashT] name: Optional[str] = None diff --git a/src/gt4py/next/program_processors/runners/gtfn_cpu.py b/src/gt4py/next/program_processors/runners/gtfn.py similarity index 76% rename from src/gt4py/next/program_processors/runners/gtfn_cpu.py rename to src/gt4py/next/program_processors/runners/gtfn.py index 31b8323474..35c10fe353 100644 --- a/src/gt4py/next/program_processors/runners/gtfn_cpu.py +++ b/src/gt4py/next/program_processors/runners/gtfn.py @@ -16,11 +16,12 @@ import numpy.typing as npt +from gt4py._core import definitions as core_defs from gt4py.eve.utils import content_hash from gt4py.next import common from gt4py.next.iterator.transforms import LiftMode -from gt4py.next.otf import languages, recipes, stages, workflow -from gt4py.next.otf.binding import cpp_interface, nanobind +from gt4py.next.otf import languages, recipes, stages, step_types, workflow +from gt4py.next.otf.binding import nanobind from gt4py.next.otf.compilation import cache, compiler from gt4py.next.otf.compilation.build_systems import compiledb from gt4py.next.program_processors import otf_compile_executor @@ -91,11 +92,23 @@ def compilation_hash(otf_closure: stages.ProgramCall) -> int: ) -GTFN_DEFAULT_TRANSLATION_STEP = gtfn_module.GTFNTranslationStep( - 
cpp_interface.CPP_DEFAULT, enable_itir_transforms=True, use_imperative_backend=False +GTFN_DEFAULT_TRANSLATION_STEP: step_types.TranslationStep[ + languages.NanobindSrcL, languages.LanguageWithHeaderFilesSettings +] = gtfn_module.GTFNTranslationStep( + enable_itir_transforms=True, + use_imperative_backend=False, + device_type=core_defs.DeviceType.CPU, ) -GTFN_DEFAULT_COMPILE_STEP = compiler.Compiler( +GTFN_GPU_TRANSLATION_STEP: step_types.TranslationStep[ + languages.NanobindSrcL, languages.LanguageWithHeaderFilesSettings +] = gtfn_module.GTFNTranslationStep( + enable_itir_transforms=True, + use_imperative_backend=False, + device_type=core_defs.DeviceType.CUDA, +) + +GTFN_DEFAULT_COMPILE_STEP: step_types.CompilationStep = compiler.Compiler( cache_strategy=cache.Strategy.SESSION, builder_factory=compiledb.CompiledbFactory() ) @@ -108,30 +121,35 @@ def compilation_hash(otf_closure: stages.ProgramCall) -> int: ) -run_gtfn = otf_compile_executor.OTFCompileExecutor[ - languages.Cpp, languages.LanguageWithHeaderFilesSettings, languages.Python, Any -](name="run_gtfn", otf_workflow=GTFN_DEFAULT_WORKFLOW) +GTFN_GPU_WORKFLOW = recipes.OTFCompileWorkflow( + translation=GTFN_GPU_TRANSLATION_STEP, + bindings=nanobind.bind_source, + compilation=GTFN_DEFAULT_COMPILE_STEP, + decoration=convert_args, +) + + +run_gtfn = otf_compile_executor.OTFCompileExecutor( + name="run_gtfn", otf_workflow=GTFN_DEFAULT_WORKFLOW +) -run_gtfn_imperative = otf_compile_executor.OTFCompileExecutor[ - languages.Cpp, languages.LanguageWithHeaderFilesSettings, languages.Python, Any -]( +run_gtfn_imperative = otf_compile_executor.OTFCompileExecutor( name="run_gtfn_imperative", otf_workflow=run_gtfn.otf_workflow.replace( translation=run_gtfn.otf_workflow.translation.replace(use_imperative_backend=True), ), ) -run_gtfn_cached = otf_compile_executor.CachedOTFCompileExecutor[ - languages.Cpp, languages.LanguageWithHeaderFilesSettings, languages.Python, Any -]( +run_gtfn_cached = otf_compile_executor.CachedOTFCompileExecutor( name="run_gtfn_cached", otf_workflow=workflow.CachedStep(step=run_gtfn.otf_workflow, hash_function=compilation_hash), ) # todo(ricoh): add API for converting an executor to a cached version of itself and vice versa +run_gtfn_gpu = otf_compile_executor.OTFCompileExecutor( + name="run_gtfn_gpu", otf_workflow=GTFN_GPU_WORKFLOW +) -run_gtfn_with_temporaries = otf_compile_executor.OTFCompileExecutor[ - languages.Cpp, languages.LanguageWithHeaderFilesSettings, languages.Python, Any -]( +run_gtfn_with_temporaries = otf_compile_executor.OTFCompileExecutor( name="run_gtfn_with_temporaries", otf_workflow=run_gtfn.otf_workflow.replace( translation=run_gtfn.otf_workflow.translation.replace(lift_mode=LiftMode.FORCE_TEMPORARIES), diff --git a/tests/next_tests/integration_tests/feature_tests/ffront_tests/__init__.py b/tests/next_tests/integration_tests/feature_tests/ffront_tests/__init__.py new file mode 100644 index 0000000000..6c43e2f12a --- /dev/null +++ b/tests/next_tests/integration_tests/feature_tests/ffront_tests/__init__.py @@ -0,0 +1,13 @@ +# GT4Py - GridTools Framework +# +# Copyright (c) 2014-2023, ETH Zurich +# All rights reserved. +# +# This file is part of the GT4Py project and the GridTools framework. +# GT4Py is free software: you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the +# Free Software Foundation, either version 3 of the License, or any later +# version. 
See the LICENSE.txt file at the top-level directory of this +# distribution for a copy of the license or check . +# +# SPDX-License-Identifier: GPL-3.0-or-later diff --git a/tests/next_tests/integration_tests/feature_tests/ffront_tests/ffront_test_utils.py b/tests/next_tests/integration_tests/feature_tests/ffront_tests/ffront_test_utils.py index 383716484e..93296ae85f 100644 --- a/tests/next_tests/integration_tests/feature_tests/ffront_tests/ffront_test_utils.py +++ b/tests/next_tests/integration_tests/feature_tests/ffront_tests/ffront_test_utils.py @@ -21,8 +21,8 @@ import gt4py.next as gtx from gt4py.next.ffront import decorator -from gt4py.next.iterator import embedded, ir as itir -from gt4py.next.program_processors.runners import gtfn_cpu, roundtrip +from gt4py.next.iterator import ir as itir +from gt4py.next.program_processors.runners import gtfn, roundtrip try: @@ -49,9 +49,9 @@ def no_backend(program: itir.FencilDefinition, *args: Any, **kwargs: Any) -> Non @pytest.fixture( params=[ roundtrip.executor, - gtfn_cpu.run_gtfn, - gtfn_cpu.run_gtfn_imperative, - gtfn_cpu.run_gtfn_with_temporaries, + gtfn.run_gtfn, + gtfn.run_gtfn_imperative, + gtfn.run_gtfn_with_temporaries, ] + OPTIONAL_PROCESSORS, ids=lambda p: next_tests.get_processor_id(p), diff --git a/tests/next_tests/integration_tests/feature_tests/ffront_tests/test_arg_call_interface.py b/tests/next_tests/integration_tests/feature_tests/ffront_tests/test_arg_call_interface.py index 1402649127..deb1382dfb 100644 --- a/tests/next_tests/integration_tests/feature_tests/ffront_tests/test_arg_call_interface.py +++ b/tests/next_tests/integration_tests/feature_tests/ffront_tests/test_arg_call_interface.py @@ -20,22 +20,11 @@ import pytest from gt4py.next import errors -from gt4py.next.common import Field -from gt4py.next.errors.exceptions import TypeError_ from gt4py.next.ffront.decorator import field_operator, program, scan_operator -from gt4py.next.ffront.fbuiltins import broadcast, int32, int64 -from gt4py.next.program_processors.runners import gtfn_cpu +from gt4py.next.ffront.fbuiltins import broadcast, int32 from next_tests.integration_tests import cases -from next_tests.integration_tests.cases import ( - IDim, - IField, - IJKField, - IJKFloatField, - JDim, - KDim, - cartesian_case, -) +from next_tests.integration_tests.cases import IDim, IField, IJKFloatField, KDim, cartesian_case from next_tests.integration_tests.feature_tests.ffront_tests.ffront_test_utils import ( fieldview_backend, ) diff --git a/tests/next_tests/integration_tests/feature_tests/ffront_tests/test_execution.py b/tests/next_tests/integration_tests/feature_tests/ffront_tests/test_execution.py index 61b34460ef..f974e07ad8 100644 --- a/tests/next_tests/integration_tests/feature_tests/ffront_tests/test_execution.py +++ b/tests/next_tests/integration_tests/feature_tests/ffront_tests/test_execution.py @@ -27,13 +27,12 @@ float64, int32, int64, - maximum, minimum, neighbor_sum, where, ) from gt4py.next.ffront.experimental import as_offset -from gt4py.next.program_processors.runners import gtfn_cpu +from gt4py.next.program_processors.runners import gtfn from next_tests.integration_tests import cases from next_tests.integration_tests.cases import ( @@ -526,12 +525,12 @@ def simple_scan_operator(carry: float) -> float: @pytest.mark.uses_lift_expressions def test_solve_triag(cartesian_case): if cartesian_case.backend in [ - gtfn_cpu.run_gtfn, - gtfn_cpu.run_gtfn_imperative, - gtfn_cpu.run_gtfn_with_temporaries, + gtfn.run_gtfn, + gtfn.run_gtfn_imperative, + 
gtfn.run_gtfn_with_temporaries, ]: pytest.xfail("Nested `scan`s requires creating temporaries.") - if cartesian_case.backend == gtfn_cpu.run_gtfn_with_temporaries: + if cartesian_case.backend == gtfn.run_gtfn_with_temporaries: pytest.xfail("Temporary extraction does not work correctly in combination with scans.") @gtx.scan_operator(axis=KDim, forward=True, init=(0.0, 0.0)) @@ -630,7 +629,7 @@ def testee(a: cases.EField, b: cases.EField) -> cases.VField: def test_ternary_scan(cartesian_case): - if cartesian_case.backend in [gtfn_cpu.run_gtfn_with_temporaries]: + if cartesian_case.backend in [gtfn.run_gtfn_with_temporaries]: pytest.xfail("Temporary extraction does not work correctly in combination with scans.") @gtx.scan_operator(axis=KDim, forward=True, init=0.0) @@ -653,7 +652,7 @@ def simple_scan_operator(carry: float, a: float) -> float: @pytest.mark.parametrize("forward", [True, False]) @pytest.mark.uses_tuple_returns def test_scan_nested_tuple_output(forward, cartesian_case): - if cartesian_case.backend in [gtfn_cpu.run_gtfn_with_temporaries]: + if cartesian_case.backend in [gtfn.run_gtfn_with_temporaries]: pytest.xfail("Temporary extraction does not work correctly in combination with scans.") init = (1, (2, 3)) @@ -690,7 +689,9 @@ def test_scan_nested_tuple_input(cartesian_case): inp2 = gtx.np_as_located_field(KDim)(np.arange(0.0, k_size, 1)) out = gtx.np_as_located_field(KDim)(np.zeros((k_size,))) - prev_levels_iterator = lambda i: range(i + 1) + def prev_levels_iterator(i): + return range(i + 1) + expected = np.asarray( [ reduce(lambda prev, i: prev + inp1[i] + inp2[i], prev_levels_iterator(i), init) @@ -758,9 +759,9 @@ def program_domain(a: cases.IField, out: cases.IField): def test_domain_input_bounds(cartesian_case): if cartesian_case.backend in [ - gtfn_cpu.run_gtfn, - gtfn_cpu.run_gtfn_imperative, - gtfn_cpu.run_gtfn_with_temporaries, + gtfn.run_gtfn, + gtfn.run_gtfn_imperative, + gtfn.run_gtfn_with_temporaries, ]: pytest.xfail("FloorDiv not fully supported in gtfn.") diff --git a/tests/next_tests/integration_tests/feature_tests/ffront_tests/test_gpu_backend.py b/tests/next_tests/integration_tests/feature_tests/ffront_tests/test_gpu_backend.py new file mode 100644 index 0000000000..290cece3fa --- /dev/null +++ b/tests/next_tests/integration_tests/feature_tests/ffront_tests/test_gpu_backend.py @@ -0,0 +1,43 @@ +# GT4Py - GridTools Framework +# +# Copyright (c) 2014-2023, ETH Zurich +# All rights reserved. +# +# This file is part of the GT4Py project and the GridTools framework. +# GT4Py is free software: you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the +# Free Software Foundation, either version 3 of the License, or any later +# version. See the LICENSE.txt file at the top-level directory of this +# distribution for a copy of the license or check . 
+# +# SPDX-License-Identifier: GPL-3.0-or-later + +import pytest + +import gt4py.next as gtx +from gt4py.next.iterator import embedded +from gt4py.next.program_processors.runners import gtfn + +from next_tests.integration_tests import cases +from next_tests.integration_tests.cases import cartesian_case # noqa: F401 +from next_tests.integration_tests.feature_tests.ffront_tests.ffront_test_utils import ( # noqa: F401 + fieldview_backend, +) + + +@pytest.mark.requires_gpu +@pytest.mark.parametrize("fieldview_backend", [gtfn.run_gtfn_gpu]) +def test_copy(cartesian_case, fieldview_backend): # noqa: F811 # fixtures + import cupy as cp # TODO(ricoh): replace with storages solution when available + + @gtx.field_operator(backend=fieldview_backend) + def testee(a: cases.IJKField) -> cases.IJKField: + return a + + inp_arr = cp.full(shape=(3, 4, 5), fill_value=3, dtype=cp.int32) + outp_arr = cp.zeros_like(inp_arr) + inp = embedded.np_as_located_field(cases.IDim, cases.JDim, cases.KDim)(inp_arr) + outp = embedded.np_as_located_field(cases.IDim, cases.JDim, cases.KDim)(outp_arr) + + testee(inp, out=outp, offset_provider={}) + assert cp.allclose(inp_arr, outp_arr) diff --git a/tests/next_tests/integration_tests/feature_tests/ffront_tests/test_gt4py_builtins.py b/tests/next_tests/integration_tests/feature_tests/ffront_tests/test_gt4py_builtins.py index 0ae874f3a6..56d5e35b3a 100644 --- a/tests/next_tests/integration_tests/feature_tests/ffront_tests/test_gt4py_builtins.py +++ b/tests/next_tests/integration_tests/feature_tests/ffront_tests/test_gt4py_builtins.py @@ -17,8 +17,8 @@ import pytest import gt4py.next as gtx -from gt4py.next import broadcast, float64, int32, int64, max_over, min_over, neighbor_sum, where -from gt4py.next.program_processors.runners import gtfn_cpu +from gt4py.next import broadcast, float64, int32, max_over, min_over, neighbor_sum, where +from gt4py.next.program_processors.runners import gtfn from next_tests.integration_tests import cases from next_tests.integration_tests.cases import ( @@ -30,7 +30,6 @@ Joff, KDim, V2EDim, - Vertex, cartesian_case, unstructured_case, ) @@ -47,9 +46,9 @@ ) def test_maxover_execution_(unstructured_case, strategy): if unstructured_case.backend in [ - gtfn_cpu.run_gtfn, - gtfn_cpu.run_gtfn_imperative, - gtfn_cpu.run_gtfn_with_temporaries, + gtfn.run_gtfn, + gtfn.run_gtfn_imperative, + gtfn.run_gtfn_with_temporaries, ]: pytest.xfail("`maxover` broken in gtfn, see #1289.") diff --git a/tests/next_tests/integration_tests/feature_tests/ffront_tests/test_math_unary_builtins.py b/tests/next_tests/integration_tests/feature_tests/ffront_tests/test_math_unary_builtins.py index 85826c1ac0..034ce56fee 100644 --- a/tests/next_tests/integration_tests/feature_tests/ffront_tests/test_math_unary_builtins.py +++ b/tests/next_tests/integration_tests/feature_tests/ffront_tests/test_math_unary_builtins.py @@ -37,7 +37,7 @@ tanh, trunc, ) -from gt4py.next.program_processors.runners import gtfn_cpu +from gt4py.next.program_processors.runners import gtfn from next_tests.integration_tests import cases from next_tests.integration_tests.cases import IDim, cartesian_case, unstructured_case @@ -69,9 +69,9 @@ def pow(inp1: cases.IField) -> cases.IField: def test_floordiv(cartesian_case): if cartesian_case.backend in [ - gtfn_cpu.run_gtfn, - gtfn_cpu.run_gtfn_imperative, - gtfn_cpu.run_gtfn_with_temporaries, + gtfn.run_gtfn, + gtfn.run_gtfn_imperative, + gtfn.run_gtfn_with_temporaries, ]: pytest.xfail( "FloorDiv not yet supported." 
diff --git a/tests/next_tests/integration_tests/feature_tests/iterator_tests/test_builtins.py b/tests/next_tests/integration_tests/feature_tests/iterator_tests/test_builtins.py index ca29c5b18b..e2bbbaa553 100644 --- a/tests/next_tests/integration_tests/feature_tests/iterator_tests/test_builtins.py +++ b/tests/next_tests/integration_tests/feature_tests/iterator_tests/test_builtins.py @@ -52,7 +52,7 @@ xor_, ) from gt4py.next.iterator.runtime import closure, fendef, fundef, offset -from gt4py.next.program_processors.runners.gtfn_cpu import run_gtfn +from gt4py.next.program_processors.runners.gtfn import run_gtfn from next_tests.integration_tests.feature_tests.math_builtin_test_data import math_builtin_test_data from next_tests.unit_tests.conftest import program_processor, run_processor diff --git a/tests/next_tests/integration_tests/multi_feature_tests/ffront_tests/test_icon_like_scan.py b/tests/next_tests/integration_tests/multi_feature_tests/ffront_tests/test_icon_like_scan.py index 8db9a4c36e..64fb238470 100644 --- a/tests/next_tests/integration_tests/multi_feature_tests/ffront_tests/test_icon_like_scan.py +++ b/tests/next_tests/integration_tests/multi_feature_tests/ffront_tests/test_icon_like_scan.py @@ -18,7 +18,7 @@ import pytest import gt4py.next as gtx -from gt4py.next.program_processors.runners import gtfn_cpu, roundtrip +from gt4py.next.program_processors.runners import gtfn, roundtrip from next_tests.integration_tests.feature_tests.ffront_tests.ffront_test_utils import ( fieldview_backend, @@ -214,9 +214,9 @@ class setup: @pytest.mark.uses_tuple_returns def test_solve_nonhydro_stencil_52_like_z_q(test_setup, fieldview_backend): if fieldview_backend in [ - gtfn_cpu.run_gtfn, - gtfn_cpu.run_gtfn_imperative, - gtfn_cpu.run_gtfn_with_temporaries, + gtfn.run_gtfn, + gtfn.run_gtfn_imperative, + gtfn.run_gtfn_with_temporaries, ]: pytest.xfail("Needs implementation of scan projector.") @@ -234,7 +234,7 @@ def test_solve_nonhydro_stencil_52_like_z_q(test_setup, fieldview_backend): @pytest.mark.uses_tuple_returns def test_solve_nonhydro_stencil_52_like_z_q_tup(test_setup, fieldview_backend): - if fieldview_backend in [gtfn_cpu.run_gtfn_with_temporaries]: + if fieldview_backend in [gtfn.run_gtfn_with_temporaries]: pytest.xfail( "Needs implementation of scan projector. Breaks in type inference as executed" "again after CollapseTuple." 
@@ -256,7 +256,7 @@ def test_solve_nonhydro_stencil_52_like_z_q_tup(test_setup, fieldview_backend):
 
 @pytest.mark.uses_tuple_returns
 def test_solve_nonhydro_stencil_52_like(test_setup, fieldview_backend):
-    if fieldview_backend in [gtfn_cpu.run_gtfn_with_temporaries]:
+    if fieldview_backend in [gtfn.run_gtfn_with_temporaries]:
         pytest.xfail("Temporary extraction does not work correctly in combination with scans.")
     solve_nonhydro_stencil_52_like.with_backend(fieldview_backend)(
         test_setup.z_alpha,
@@ -273,7 +273,7 @@ def test_solve_nonhydro_stencil_52_like(test_setup, fieldview_backend):
 
 @pytest.mark.uses_tuple_returns
 def test_solve_nonhydro_stencil_52_like_with_gtfn_tuple_merge(test_setup, fieldview_backend):
-    if fieldview_backend in [gtfn_cpu.run_gtfn_with_temporaries]:
+    if fieldview_backend in [gtfn.run_gtfn_with_temporaries]:
         pytest.xfail("Temporary extraction does not work correctly in combination with scans.")
     if fieldview_backend == roundtrip.executor:
         pytest.xfail("Needs proper handling of tuple[Column] <-> Column[tuple].")
diff --git a/tests/next_tests/integration_tests/multi_feature_tests/iterator_tests/test_anton_toy.py b/tests/next_tests/integration_tests/multi_feature_tests/iterator_tests/test_anton_toy.py
index 16d839a8ab..4e295e92af 100644
--- a/tests/next_tests/integration_tests/multi_feature_tests/iterator_tests/test_anton_toy.py
+++ b/tests/next_tests/integration_tests/multi_feature_tests/iterator_tests/test_anton_toy.py
@@ -18,7 +18,7 @@
 import gt4py.next as gtx
 from gt4py.next.iterator.builtins import cartesian_domain, deref, lift, named_range, shift
 from gt4py.next.iterator.runtime import closure, fendef, fundef, offset
-from gt4py.next.program_processors.runners import gtfn_cpu
+from gt4py.next.program_processors.runners import gtfn
 
 from next_tests.unit_tests.conftest import lift_mode, program_processor, run_processor
 
@@ -79,9 +79,9 @@ def test_anton_toy(program_processor, lift_mode):
     program_processor, validate = program_processor
 
     if program_processor in [
-        gtfn_cpu.run_gtfn,
-        gtfn_cpu.run_gtfn_imperative,
-        gtfn_cpu.run_gtfn_with_temporaries,
+        gtfn.run_gtfn,
+        gtfn.run_gtfn_imperative,
+        gtfn.run_gtfn_with_temporaries,
     ]:
         from gt4py.next.iterator import transforms
 
diff --git a/tests/next_tests/integration_tests/multi_feature_tests/iterator_tests/test_fvm_nabla.py b/tests/next_tests/integration_tests/multi_feature_tests/iterator_tests/test_fvm_nabla.py
index 42de13ef44..445b73548b 100644
--- a/tests/next_tests/integration_tests/multi_feature_tests/iterator_tests/test_fvm_nabla.py
+++ b/tests/next_tests/integration_tests/multi_feature_tests/iterator_tests/test_fvm_nabla.py
@@ -16,7 +16,7 @@
 
 import pytest
 
-pytest.importorskip("atlas4py")
+pytest.importorskip("atlas4py")  # isort: skip
 
 import gt4py.next as gtx
 from gt4py.next.iterator import library
@@ -37,7 +37,6 @@
 )
 from gt4py.next.iterator.runtime import closure, fendef, fundef, offset
 from gt4py.next.iterator.transforms.pass_manager import LiftMode
-from gt4py.next.program_processors.runners import gtfn_cpu
 
 from next_tests.integration_tests.multi_feature_tests.iterator_tests.fvm_nabla_setup import (
     assert_close,
diff --git a/tests/next_tests/integration_tests/multi_feature_tests/iterator_tests/test_hdiff.py b/tests/next_tests/integration_tests/multi_feature_tests/iterator_tests/test_hdiff.py
index 7bd028b7c3..af70dd590f 100644
--- a/tests/next_tests/integration_tests/multi_feature_tests/iterator_tests/test_hdiff.py
+++ b/tests/next_tests/integration_tests/multi_feature_tests/iterator_tests/test_hdiff.py
@@ -18,7 +18,7 @@
 import gt4py.next as gtx
 from gt4py.next.iterator.builtins import *
 from gt4py.next.iterator.runtime import closure, fendef, fundef, offset
-from gt4py.next.program_processors.runners import gtfn_cpu
+from gt4py.next.program_processors.runners import gtfn
 
 from next_tests.integration_tests.cases import IDim, JDim
 from next_tests.integration_tests.multi_feature_tests.iterator_tests.hdiff_reference import (
@@ -75,9 +75,9 @@ def hdiff(inp, coeff, out, x, y):
 def test_hdiff(hdiff_reference, program_processor, lift_mode):
     program_processor, validate = program_processor
     if program_processor in [
-        gtfn_cpu.run_gtfn,
-        gtfn_cpu.run_gtfn_imperative,
-        gtfn_cpu.run_gtfn_with_temporaries,
+        gtfn.run_gtfn,
+        gtfn.run_gtfn_imperative,
+        gtfn.run_gtfn_with_temporaries,
     ]:
         # TODO(tehrengruber): check if still true
         from gt4py.next.iterator import transforms
diff --git a/tests/next_tests/integration_tests/multi_feature_tests/iterator_tests/test_vertical_advection.py b/tests/next_tests/integration_tests/multi_feature_tests/iterator_tests/test_vertical_advection.py
index f11046cb5d..a0471e8baa 100644
--- a/tests/next_tests/integration_tests/multi_feature_tests/iterator_tests/test_vertical_advection.py
+++ b/tests/next_tests/integration_tests/multi_feature_tests/iterator_tests/test_vertical_advection.py
@@ -19,10 +19,8 @@
 from gt4py.next.iterator.builtins import *
 from gt4py.next.iterator.runtime import closure, fendef, fundef
 from gt4py.next.iterator.transforms import LiftMode
-from gt4py.next.program_processors.formatters.gtfn import (
-    format_sourcecode as gtfn_format_sourcecode,
-)
-from gt4py.next.program_processors.runners import gtfn_cpu
+from gt4py.next.program_processors.formatters import gtfn as gtfn_formatters
+from gt4py.next.program_processors.runners import gtfn
 
 from next_tests.integration_tests.cases import IDim, JDim, KDim
 from next_tests.unit_tests.conftest import lift_mode, program_processor, run_processor
@@ -121,16 +119,16 @@ def test_tridiag(fencil, tridiag_reference, program_processor, lift_mode):
     if (
         program_processor
         in [
-            gtfn_cpu.run_gtfn,
-            gtfn_cpu.run_gtfn_imperative,
-            gtfn_cpu.run_gtfn_with_temporaries,
-            gtfn_format_sourcecode,
+            gtfn.run_gtfn,
+            gtfn.run_gtfn_imperative,
+            gtfn.run_gtfn_with_temporaries,
+            gtfn_formatters.format_sourcecode,
         ]
         and lift_mode == LiftMode.FORCE_INLINE
     ):
         pytest.skip("gtfn does only support lifted scans when using temporaries")
     if (
-        program_processor == gtfn_cpu.run_gtfn_with_temporaries
+        program_processor == gtfn.run_gtfn_with_temporaries
         or lift_mode == LiftMode.FORCE_TEMPORARIES
     ):
         pytest.xfail("tuple_get on columns not supported.")
diff --git a/tests/next_tests/integration_tests/multi_feature_tests/iterator_tests/test_with_toy_connectivity.py b/tests/next_tests/integration_tests/multi_feature_tests/iterator_tests/test_with_toy_connectivity.py
index 92b93ddb63..d475fab3a8 100644
--- a/tests/next_tests/integration_tests/multi_feature_tests/iterator_tests/test_with_toy_connectivity.py
+++ b/tests/next_tests/integration_tests/multi_feature_tests/iterator_tests/test_with_toy_connectivity.py
@@ -30,15 +30,13 @@
     shift,
 )
 from gt4py.next.iterator.runtime import fundef
-from gt4py.next.program_processors.formatters import gtfn
-from gt4py.next.program_processors.runners import gtfn_cpu
+from gt4py.next.program_processors.runners import gtfn
 
 from next_tests.toy_connectivity import (
     C2E,
     E2V,
     V2E,
     V2V,
-    C2EDim,
     Cell,
     E2VDim,
     Edge,
@@ -409,9 +407,9 @@ def shift_sparse_stencil2(inp):
 def test_shift_sparse_input_field2(program_processor, lift_mode):
     program_processor, validate = program_processor
     if program_processor in [
-        gtfn_cpu.run_gtfn,
-        gtfn_cpu.run_gtfn_imperative,
-        gtfn_cpu.run_gtfn_with_temporaries,
+        gtfn.run_gtfn,
+        gtfn.run_gtfn_imperative,
+        gtfn.run_gtfn_with_temporaries,
     ]:
         pytest.xfail(
             "Bug in bindings/compilation/caching: only the first program seems to be compiled."
diff --git a/tests/next_tests/integration_tests/multi_feature_tests/otf_tests/test_gtfn_workflow.py b/tests/next_tests/integration_tests/multi_feature_tests/otf_tests/test_gtfn_workflow.py
index 4e456637cf..c60079eaf1 100644
--- a/tests/next_tests/integration_tests/multi_feature_tests/otf_tests/test_gtfn_workflow.py
+++ b/tests/next_tests/integration_tests/multi_feature_tests/otf_tests/test_gtfn_workflow.py
@@ -14,7 +14,7 @@
 import numpy as np
 
 import gt4py.next as gtx
-from gt4py.next.program_processors.runners import gtfn_cpu
+from gt4py.next.program_processors.runners import gtfn
 
 from next_tests.integration_tests.cases import IDim, JDim
 
@@ -37,7 +37,7 @@ def test_different_buffer_sizes():
     )
     out = gtx.np_as_located_field(IDim, JDim)(np.zeros((out_nx, out_ny), dtype=np.int32))
 
-    @gtx.field_operator(backend=gtfn_cpu.run_gtfn)
+    @gtx.field_operator(backend=gtfn.run_gtfn)
     def copy(inp: gtx.Field[[IDim, JDim], gtx.int32]) -> gtx.Field[[IDim, JDim], gtx.int32]:
         return inp
 
diff --git a/tests/next_tests/unit_tests/conftest.py b/tests/next_tests/unit_tests/conftest.py
index 7a62778be1..747431599a 100644
--- a/tests/next_tests/unit_tests/conftest.py
+++ b/tests/next_tests/unit_tests/conftest.py
@@ -22,8 +22,8 @@
 from gt4py import eve
 from gt4py.next.iterator import ir as itir, pretty_parser, pretty_printer, runtime, transforms
 from gt4py.next.program_processors import processor_interface as ppi
-from gt4py.next.program_processors.formatters import gtfn, lisp, type_check
-from gt4py.next.program_processors.runners import double_roundtrip, gtfn_cpu, roundtrip
+from gt4py.next.program_processors.formatters import gtfn as gtfn_formatters, lisp, type_check
+from gt4py.next.program_processors.runners import double_roundtrip, gtfn, roundtrip
 
 
 try:
@@ -78,10 +78,10 @@ def pretty_format_and_check(root: itir.FencilDefinition, *args, **kwargs) -> str
         (roundtrip.executor, True),
         (type_check.check, False),
         (double_roundtrip.executor, True),
-        (gtfn_cpu.run_gtfn, True),
-        (gtfn_cpu.run_gtfn_imperative, True),
-        (gtfn_cpu.run_gtfn_with_temporaries, True),
-        (gtfn.format_sourcecode, False),
+        (gtfn.run_gtfn, True),
+        (gtfn.run_gtfn_imperative, True),
+        (gtfn.run_gtfn_with_temporaries, True),
+        (gtfn_formatters.format_sourcecode, False),
     ]
     + OPTIONAL_PROCESSORS,
     ids=lambda p: next_tests.get_processor_id(p[0]),
diff --git a/tests/next_tests/unit_tests/otf_tests/compilation_tests/build_systems_tests/conftest.py b/tests/next_tests/unit_tests/otf_tests/compilation_tests/build_systems_tests/conftest.py
index 1fab2643b5..45ef85e37c 100644
--- a/tests/next_tests/unit_tests/otf_tests/compilation_tests/build_systems_tests/conftest.py
+++ b/tests/next_tests/unit_tests/otf_tests/compilation_tests/build_systems_tests/conftest.py
@@ -78,7 +78,7 @@ def make_program_source(name: str) -> stages.ProgramSource:
         entry_point=entry_point,
         source_code=src,
         library_deps=[
-            interface.LibraryDependency("gridtools", "master"),
+            interface.LibraryDependency("gridtools_cpu", "master"),
         ],
         language=languages.Cpp,
         language_settings=cpp_interface.CPP_DEFAULT,
diff --git a/tests/next_tests/unit_tests/program_processor_tests/codegens_tests/gtfn_tests/test_gtfn_module.py b/tests/next_tests/unit_tests/program_processor_tests/codegens_tests/gtfn_tests/test_gtfn_module.py
index 93be884687..ae5f582e47 100644
--- a/tests/next_tests/unit_tests/program_processor_tests/codegens_tests/gtfn_tests/test_gtfn_module.py
+++ b/tests/next_tests/unit_tests/program_processor_tests/codegens_tests/gtfn_tests/test_gtfn_module.py
@@ -65,9 +65,9 @@ def fencil_example():
 
 def test_codegen(fencil_example):
     fencil, parameters = fencil_example
-    module = gtfn_module.translate_program(
+    module = gtfn_module.translate_program_cpu(
         stages.ProgramCall(fencil, parameters, {"offset_provider": {}})
     )
     assert module.entry_point.name == fencil.id
-    assert any(d.name == "gridtools" for d in module.library_deps)
+    assert any(d.name == "gridtools_cpu" for d in module.library_deps)
     assert module.language is languages.Cpp
diff --git a/tox.ini b/tox.ini
index e16aaff27f..18a6ff8e84 100644
--- a/tox.ini
+++ b/tox.ini
@@ -71,7 +71,7 @@ commands =
     python -m pytest --cache-clear -v -n {env:NUM_PROCESSES:1} {posargs} tests{/}eve_tests
    python -m pytest --doctest-modules src{/}gt4py{/}eve
 
-[testenv:next-py{310}-{nomesh,atlas}]
+[testenv:next-py{310}-{nomesh,atlas}-{cpu,cuda,cuda11x,cuda12x}]
 description = Run 'gt4py.next' tests
 pass_env = {[testenv]pass_env}, BOOST_ROOT, BOOST_HOME, CUDA_HOME, CUDA_PATH
 deps =
@@ -81,8 +81,10 @@ set_env =
     {[testenv]set_env}
     PIP_EXTRA_INDEX_URL = {env:PIP_EXTRA_INDEX_URL:https://test.pypi.org/simple/}
 commands =
-    nomesh: python -m pytest --cache-clear -v -n {env:NUM_PROCESSES:1} -m "not requires_atlas" {posargs} tests{/}next_tests
-    atlas: python -m pytest --cache-clear -v -n {env:NUM_PROCESSES:1} -m "requires_atlas" {posargs} tests{/}next_tests
+    nomesh-cpu: python -m pytest --cache-clear -v -n {env:NUM_PROCESSES:1} -m "not requires_atlas and not requires_gpu" {posargs} tests{/}next_tests
+    nomesh-gpu: python -m pytest --cache-clear -v -n {env:NUM_PROCESSES:1} -m "not requires_atlas and requires_gpu" {posargs} tests{/}next_tests
+    atlas-cpu: python -m pytest --cache-clear -v -n {env:NUM_PROCESSES:1} -m "requires_atlas and not requires_gpu" {posargs} tests{/}next_tests
+    atlas-gpu: python -m pytest --cache-clear -v -n {env:NUM_PROCESSES:1} -m "requires_atlas and requires_gpu" {posargs} tests{/}next_tests
     pytest --doctest-modules src{/}gt4py{/}next
 
 [testenv:storage-py{38,39,310}-{internal,dace}-{cpu,cuda,cuda11x,cuda12x}]