-
Notifications
You must be signed in to change notification settings - Fork 48
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add
rapids_test
allowing projects to run gpu tests in parallel (#328)
Introduces `rapids_test` functionality to allow tests executed via `ctest -j` to properly share GPU resources. This is done by having tests state how many GPU allocations they require, and by using CTest's internal job scheduler to properly load balance. Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: #328
- Loading branch information
1 parent
e6a4d70
commit f7876e6
Showing
70 changed files
with
2,492 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
.. cmake-module:: ../../rapids-cmake/test/add.cmake |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
.. cmake-module:: ../../rapids-cmake/test/generate_resource_spec.cmake |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
.. cmake-module:: ../../rapids-cmake/test/gpu_requirements.cmake |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
.. cmake-module:: ../../rapids-cmake/test/init.cmake |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
.. cmake-module:: ../../rapids-cmake/test/install_relocatable.cmake |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
/* | ||
* Copyright (c) 2022-2023, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#include <rapids_cmake_ctest_allocation.hpp> | ||
|
||
#include <cuda_runtime_api.h> | ||
|
||
#include <algorithm> | ||
#include <cstdlib> | ||
#include <numeric> | ||
#include <string> | ||
#include <string_view> | ||
|
||
namespace rapids_cmake { | ||
|
||
namespace { | ||
// Builds the sentinel allocation whose `device_id` and `slots` are both -1.
GPUAllocation noGPUAllocation()
{
  return GPUAllocation{-1, -1};
}
|
||
// Parses the first GPU allocation stored in the environment variable whose
// name is `env_variable`.
//
// The value is expected to look like "id:<number>,slots:<number>".
//
// Returns `noGPUAllocation()` when the variable is not set, is empty, or is
// missing either the "id:" or the "slots:" component.
//
// Note: a component that is present but does not start with a usable integer
// still makes `std::stoi` throw (std::invalid_argument / std::out_of_range).
GPUAllocation parseCTestAllocation(std::string_view env_variable)
{
  constexpr std::string_view id_key{"id:"};
  constexpr std::string_view slots_key{"slots:"};

  // std::getenv requires a null-terminated name, which a std::string_view
  // does not guarantee, so copy the name into an owning string first.
  const std::string variable_name{env_variable};
  const char* raw_value = std::getenv(variable_name.c_str());
  if (raw_value == nullptr) { return noGPUAllocation(); }

  const std::string gpu_resources{raw_value};

  // Look both keys up before doing any offset arithmetic: adding the key
  // length to std::string::npos would wrap around to a bogus position.
  const auto id_pos    = gpu_resources.find(id_key);
  const auto slots_pos = gpu_resources.find(slots_key);
  if (id_pos == std::string::npos || slots_pos == std::string::npos) { return noGPUAllocation(); }

  const auto id_start   = id_pos + id_key.size();
  const auto id_end     = gpu_resources.find(',', id_start);
  const auto slot_start = slots_pos + slots_key.size();

  // Without a trailing comma the id runs to the end of the string.
  const auto id_length = (id_end == std::string::npos) ? std::string::npos : id_end - id_start;

  const auto id    = gpu_resources.substr(id_start, id_length);
  const auto slots = gpu_resources.substr(slot_start);

  return GPUAllocation{std::stoi(id), std::stoi(slots)};
}
|
||
std::vector<GPUAllocation> determineGPUAllocations() | ||
{ | ||
std::vector<GPUAllocation> allocations; | ||
const auto* resource_count = std::getenv("CTEST_RESOURCE_GROUP_COUNT"); | ||
if (!resource_count) { | ||
allocations.emplace_back(); | ||
return allocations; | ||
} | ||
|
||
const auto resource_max = std::stoi(resource_count); | ||
for (int index = 0; index < resource_max; ++index) { | ||
std::string group_env = "CTEST_RESOURCE_GROUP_" + std::to_string(index); | ||
std::string resource_group{std::getenv(group_env.c_str())}; | ||
std::transform(resource_group.begin(), resource_group.end(), resource_group.begin(), ::toupper); | ||
|
||
if (resource_group == "GPUS") { | ||
auto resource_env = group_env + "_" + resource_group; | ||
auto&& allocation = parseCTestAllocation(resource_env); | ||
allocations.emplace_back(allocation); | ||
} | ||
} | ||
|
||
return allocations; | ||
} | ||
} // namespace | ||
|
||
// True when the CTEST_RESOURCE_GROUP_COUNT environment variable is set
// (even to an empty value), false otherwise.
bool using_resources()
{
  return std::getenv("CTEST_RESOURCE_GROUP_COUNT") != nullptr;
}
|
||
// Public wrapper around determineGPUAllocations(); the environment is
// queried again on every call.
std::vector<GPUAllocation> full_allocation()
{
  return determineGPUAllocations();
}
|
||
// Makes the allocation's `device_id` the current CUDA device and returns the
// status reported by `cudaSetDevice`.
cudaError_t bind_to_gpu(GPUAllocation const& alloc)
{
  const int device = alloc.device_id;
  return cudaSetDevice(device);
}
|
||
bool bind_to_first_gpu() | ||
{ | ||
if (using_resources()) { | ||
std::vector<GPUAllocation> allocs = determineGPUAllocations(); | ||
return (bind_to_gpu(allocs[0]) == cudaSuccess); | ||
} | ||
return false; | ||
} | ||
|
||
} // namespace rapids_cmake |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
/* | ||
* Copyright (c) 2022-2023, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#pragma once | ||
|
||
#include <cuda_runtime_api.h> | ||
#include <vector> | ||
|
||
namespace rapids_cmake { | ||
|
||
/*
 * A single GPU allocation handed to a test by a CTest resource specification.
 *
 * CTest uses these allocations to load balance tests across the available
 * GPUs.
 */
struct GPUAllocation {
  // CUDA GPU id, in the form expected by `cudaSetDevice`.
  int device_id;
  // Percentage of the GPU that the test will use.
  int slots;
};
|
||
/* | ||
* Returns true when a CTest resource specification has been specified. | ||
* | ||
* Since the vast majority of tests should execute without a CTest resource | ||
* spec (e.g. when executed manually by a developer), callers of `rapids_cmake` | ||
* should first ensure that a CTest resource spec file has been provided before | ||
* trying to query/bind to the allocation. | ||
* | ||
* ```cxx | ||
* if (rapids_cmake::using_resources()) { | ||
* rapids_cmake::bind_to_first_gpu(); | ||
* } | ||
* ``` | ||
*/ | ||
bool using_resources(); | ||
|
||
/* | ||
* Returns all GPUAllocations allocated for a test | ||
* | ||
* To support multi-GPU tests the CTest resource specification allows a | ||
* test to request multiple GPUs. As CUDA only allows binding to a | ||
* single GPU at any time, this API allows tests to know what CUDA | ||
* devices they should bind to. | ||
* | ||
* Note: The `device_id` of each allocation might not be unique. | ||
* If a test says it needs 50% of two GPUs, it could be allocated | ||
* the same physical GPU. If a test needs distinct / unique devices | ||
* it must request 51%+ of a device. | ||
* | ||
* Note: rapids_cmake does no caching, so this query should be cached | ||
* instead of called multiple times. | ||
*/ | ||
std::vector<GPUAllocation> full_allocation(); | ||
|
||
/* | ||
* Have CUDA bind to a given GPUAllocation | ||
* | ||
* Have CUDA bind to the `device_id` specified in the CTest | ||
* GPU allocation | ||
* | ||
* Note: Return value is the cudaError_t of `cudaSetDevice` | ||
*/ | ||
cudaError_t bind_to_gpu(GPUAllocation const& alloc); | ||
|
||
/* | ||
* Convenience method to bind to the first GPU that CTest has allocated | ||
* Provided as most RAPIDS tests only require a single GPU | ||
* | ||
* Will return `false` if no GPUs have been allocated, or if setting | ||
* the CUDA device failed for any reason. | ||
*/ | ||
bool bind_to_first_gpu(); | ||
|
||
} // namespace rapids_cmake |
Oops, something went wrong.