Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New configuration to limit the arena extension #15983

Merged
merged 1 commit into from
May 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions cmake/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -805,7 +805,7 @@ function(onnxruntime_set_compile_flags target_name)
set_target_properties(${target_name} PROPERTIES COMPILE_WARNING_AS_ERROR ON)
if (onnxruntime_USE_CUDA)
# Suppress a "conversion_function_not_usable" warning in gsl/span
target_compile_options(${target_name} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--diag-suppress 554>")
target_compile_options(${target_name} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcudafe \"--diag_suppress=conversion_function_not_usable\">")
endif()
if (MSVC)
foreach(CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORY ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
Expand Down Expand Up @@ -852,7 +852,7 @@ function(onnxruntime_set_compile_flags target_name)
target_compile_options(${target_name} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /analyze:external->" "$<$<COMPILE_LANGUAGE:CXX,C>:/analyze:external->")
target_compile_options(${target_name} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /wd6385>" )
# There are many such warnings from STL:
# include\list(148): warning C6011: Dereferencing NULL pointer '_Mycont'. : Lines: 146, 147, 148
# include\list(148): warning C6011: Dereferencing NULL pointer '_Mycont'. : Lines: 146, 147, 148
target_compile_options(${target_name} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /wd6011>" )
endif()
else()
Expand Down Expand Up @@ -920,7 +920,7 @@ endfunction()
function(onnxruntime_configure_target target_name)
target_link_directories(${target_name} PRIVATE ${onnxruntime_LINK_DIRS})
onnxruntime_set_compile_flags(${target_name})
onnxruntime_set_source_file_properties(${target_name})
onnxruntime_set_source_file_properties(${target_name})
if(WIN32 AND onnxruntime_ENABLE_STATIC_ANALYSIS AND onnxruntime_USE_CUSTOM_STATIC_ANALYSIS_RULES)
set_target_properties(${target_name} PROPERTIES VS_USER_PROPS ${PROJECT_SOURCE_DIR}/EnableVisualStudioCodeAnalysis.props)
endif()
Expand Down
22 changes: 13 additions & 9 deletions include/onnxruntime/core/framework/allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,20 +16,24 @@ struct OrtArenaCfg {
arena_extend_strategy(-1),
initial_chunk_size_bytes(-1),
max_dead_bytes_per_chunk(-1),
initial_growth_chunk_size_bytes(-1) {}
initial_growth_chunk_size_bytes(-1),
max_power_of_two_extend_bytes(-1) {}
OrtArenaCfg(size_t max_mem, int arena_extend_strategy, int initial_chunk_size_bytes,
int max_dead_bytes_per_chunk, int initial_growth_chunk_size_bytes)
int max_dead_bytes_per_chunk, int initial_growth_chunk_size_bytes,
int64_t max_power_of_two_extend_bytes)
: max_mem(max_mem),
arena_extend_strategy(arena_extend_strategy),
initial_chunk_size_bytes(initial_chunk_size_bytes),
max_dead_bytes_per_chunk(max_dead_bytes_per_chunk),
initial_growth_chunk_size_bytes(initial_growth_chunk_size_bytes) {}

size_t max_mem; // use 0 to allow ORT to choose the default
int arena_extend_strategy; // use -1 to allow ORT to choose the default, 0 = kNextPowerOfTwo, 1 = kSameAsRequested
int initial_chunk_size_bytes; // use -1 to allow ORT to choose the default
int max_dead_bytes_per_chunk; // use -1 to allow ORT to choose the default
int initial_growth_chunk_size_bytes; // use -1 to allow ORT to choose the default
initial_growth_chunk_size_bytes(initial_growth_chunk_size_bytes),
max_power_of_two_extend_bytes(max_power_of_two_extend_bytes) {}

size_t max_mem; // use 0 to allow ORT to choose the default
int arena_extend_strategy; // use -1 to allow ORT to choose the default, 0 = kNextPowerOfTwo, 1 = kSameAsRequested
int initial_chunk_size_bytes; // use -1 to allow ORT to choose the default
int max_dead_bytes_per_chunk; // use -1 to allow ORT to choose the default
int initial_growth_chunk_size_bytes; // use -1 to allow ORT to choose the default
int64_t max_power_of_two_extend_bytes; // use -1 to allow ORT to choose the default
};

namespace onnxruntime {
Expand Down
4 changes: 4 additions & 0 deletions include/onnxruntime/core/session/onnxruntime_c_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -2728,6 +2728,10 @@ struct OrtApi {
* crossing which the current chunk is chunked into 2.
* "initial_growth_chunk_size_bytes": (Possible) Size of the second allocation in the arena.
* Only relevant if arena strategy is `kNextPowerOfTwo`. Use -1 to allow ORT to choose the default.
* "max_power_of_two_extend_bytes": The maximum extend size if arena strategy is `kNextPowerOfTwo`.
* It is not an allocation limit; it only limits the extension size when the requested bytes are less than the limit.
* When the requested bytes are more than the limit, the allocator will still allocate as requested.
* Use -1 to allow ORT to choose the default 1GB for max_power_of_two_extend_bytes.
* Ultimately, the allocation size is determined by the allocation memory request.
* Further allocation sizes are governed by the arena extend strategy.
*
Expand Down
1 change: 1 addition & 0 deletions include/onnxruntime/core/session/onnxruntime_cxx_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -1526,6 +1526,7 @@ struct ArenaCfg : detail::Base<OrtArenaCfg> {
* \param arena_extend_strategy - use -1 to allow ORT to choose the default, 0 = kNextPowerOfTwo, 1 = kSameAsRequested
* \param initial_chunk_size_bytes - use -1 to allow ORT to choose the default
* \param max_dead_bytes_per_chunk - use -1 to allow ORT to choose the default
* \param max_power_of_two_extend_bytes - use -1 to allow ORT to choose the default
* See docs/C_API.md for details on what the following parameters mean and how to choose these values
*/
ArenaCfg(size_t max_mem, int arena_extend_strategy, int initial_chunk_size_bytes, int max_dead_bytes_per_chunk);
Expand Down
2 changes: 1 addition & 1 deletion onnxruntime/contrib_ops/cuda/bert/attention.cc
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ Status Attention<T>::ComputeInternal(OpKernelContext* context) const {
has_memory_efficient_attention(sm, sizeof(T) == 2);
#else
constexpr bool use_memory_efficient_attention = false;
ORT_UNUSED_VARIABLE(is_mask_1d_key_seq_len_start);
ORT_UNUSED_PARAMETER(is_mask_1d_key_seq_len_start);
#endif

cublasHandle_t cublas = GetCublasHandle(context);
Expand Down
1 change: 1 addition & 0 deletions onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@ Status MultiHeadAttention<T>::ComputeInternal(OpKernelContext* context) const {
has_memory_efficient_attention(sm, sizeof(T) == 2);
#else
constexpr bool use_memory_efficient_attention = false;
ORT_UNUSED_PARAMETER(is_mask_1d_key_seq_len_start);
#endif

// When packed kv or packed qkv is used, there is no need for add bias transpose, thus no qkv workspace.
Expand Down
6 changes: 5 additions & 1 deletion onnxruntime/core/framework/allocatormgr.cc
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ AllocatorPtr CreateAllocator(const AllocatorCreationInfo& info) {
int initial_growth_chunk_size_bytes = info.arena_cfg.initial_growth_chunk_size_bytes == -1
? BFCArena::DEFAULT_INITIAL_GROWTH_CHUNK_SIZE_BYTES
: info.arena_cfg.initial_growth_chunk_size_bytes;
int64_t max_power_of_two_extend_bytes = info.arena_cfg.max_power_of_two_extend_bytes == -1
? BFCArena::DEFAULT_MAX_POWER_OF_TWO_EXTEND_BYTES
: info.arena_cfg.max_power_of_two_extend_bytes;
ArenaExtendStrategy arena_extend_str;
switch (info.arena_cfg.arena_extend_strategy) {
case static_cast<int>(ArenaExtendStrategy::kSameAsRequested):
Expand Down Expand Up @@ -77,7 +80,8 @@ AllocatorPtr CreateAllocator(const AllocatorCreationInfo& info) {
arena_extend_str,
initial_chunk_size_bytes,
max_dead_bytes_per_chunk,
initial_growth_chunk_size_bytes));
initial_growth_chunk_size_bytes,
max_power_of_two_extend_bytes));
}
} else {
return device_allocator;
Expand Down
2 changes: 1 addition & 1 deletion onnxruntime/core/framework/allocatormgr.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ struct AllocatorCreationInfo {
AllocatorCreationInfo(AllocatorFactory device_alloc_factory,
OrtDevice::DeviceId device_id = 0,
bool use_arena = true,
OrtArenaCfg arena_cfg = {0, -1, -1, -1, -1},
OrtArenaCfg arena_cfg = {0, -1, -1, -1, -1, -1L},
bool stream_aware_arena = false,
bool cross_stream_reusing = false)
: device_alloc_factory(device_alloc_factory),
Expand Down
30 changes: 20 additions & 10 deletions onnxruntime/core/framework/bfc_arena.cc
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ BFCArena::BFCArena(std::unique_ptr<IAllocator> resource_allocator,
ArenaExtendStrategy arena_extend_strategy,
int initial_chunk_size_bytes,
int max_dead_bytes_per_chunk,
int initial_growth_chunk_size_bytes)
int initial_growth_chunk_size_bytes,
int64_t max_power_of_two_extend_bytes)
: IAllocator(OrtMemoryInfo(resource_allocator->Info().name,
OrtAllocatorType::OrtArenaAllocator,
resource_allocator->Info().device,
Expand All @@ -23,11 +24,13 @@ BFCArena::BFCArena(std::unique_ptr<IAllocator> resource_allocator,
next_allocation_id_(1),
initial_chunk_size_bytes_(initial_chunk_size_bytes),
max_dead_bytes_per_chunk_(max_dead_bytes_per_chunk),
initial_growth_chunk_size_bytes_(initial_growth_chunk_size_bytes) {
initial_growth_chunk_size_bytes_(initial_growth_chunk_size_bytes),
max_power_of_two_extend_bytes_(max_power_of_two_extend_bytes) {
LOGS_DEFAULT(INFO) << "Creating BFCArena for " << device_allocator_->Info().name
<< " with following configs: initial_chunk_size_bytes: " << initial_chunk_size_bytes_
<< " max_dead_bytes_per_chunk: " << max_dead_bytes_per_chunk_
<< " initial_growth_chunk_size_bytes: " << initial_growth_chunk_size_bytes_
<< " max_power_of_two_extend_bytes: " << max_power_of_two_extend_bytes_
<< " memory limit: " << total_memory
<< " arena_extend_strategy: " << static_cast<int32_t>(arena_extend_strategy);

Expand Down Expand Up @@ -144,7 +147,12 @@ Status BFCArena::Extend(size_t rounded_bytes) {
// we allocated the same number of bytes as the current region
// the 2x is to double the minimum size of the next amount we'll allocate
if (!increased_allocation) {
curr_region_allocation_bytes_ *= 2;
if (arena_extend_strategy_ == ArenaExtendStrategy::kNextPowerOfTwo &&
curr_region_allocation_bytes_ * 2 < max_power_of_two_extend_bytes_) {
curr_region_allocation_bytes_ *= 2;
} else {
curr_region_allocation_bytes_ = max_power_of_two_extend_bytes_;
}
}
} else if (arena_extend_strategy_ == ArenaExtendStrategy::kSameAsRequested) {
// BFC Arena could cause internal and external fragmentation. But, running training with
Expand Down Expand Up @@ -847,13 +855,15 @@ StreamAwareArena::StreamAwareArena(std::unique_ptr<IAllocator> resource_allocato
ArenaExtendStrategy arena_extend_strategy,
int initial_chunk_size_bytes,
int max_dead_bytes_per_chunk,
int initial_growth_chunk_size_bytes) : BFCArena(std::move(resource_allocator),
total_memory,
arena_extend_strategy,
initial_chunk_size_bytes,
max_dead_bytes_per_chunk,
initial_growth_chunk_size_bytes),
enable_cross_stream_reusing_(enable_cross_stream_sharing) {
int initial_growth_chunk_size_bytes,
int64_t max_power_of_two_extend_bytes) : BFCArena(std::move(resource_allocator),
total_memory,
arena_extend_strategy,
initial_chunk_size_bytes,
max_dead_bytes_per_chunk,
initial_growth_chunk_size_bytes,
max_power_of_two_extend_bytes),
enable_cross_stream_reusing_(enable_cross_stream_sharing) {
arena_type_ = ArenaType::StreamAwareArena;
}

Expand Down
8 changes: 6 additions & 2 deletions onnxruntime/core/framework/bfc_arena.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ class BFCArena : public IAllocator {
static const int DEFAULT_INITIAL_CHUNK_SIZE_BYTES = 1 * 1024 * 1024;
static const int DEFAULT_MAX_DEAD_BYTES_PER_CHUNK = 128 * 1024 * 1024;
static const int DEFAULT_INITIAL_GROWTH_CHUNK_SIZE_BYTES = 2 * 1024 * 1024;
static const int64_t DEFAULT_MAX_POWER_OF_TWO_EXTEND_BYTES = 1024 * 1024 * 1024; // 1GB
static const size_t DEFAULT_MAX_MEM = std::numeric_limits<size_t>::max();

enum ArenaType {
Expand All @@ -71,7 +72,8 @@ class BFCArena : public IAllocator {
ArenaExtendStrategy arena_extend_strategy = DEFAULT_ARENA_EXTEND_STRATEGY,
int initial_chunk_size_bytes = DEFAULT_INITIAL_CHUNK_SIZE_BYTES,
int max_dead_bytes_per_chunk = DEFAULT_MAX_DEAD_BYTES_PER_CHUNK,
int initial_growth_chunk_size_bytes = DEFAULT_INITIAL_GROWTH_CHUNK_SIZE_BYTES);
int initial_growth_chunk_size_bytes = DEFAULT_INITIAL_GROWTH_CHUNK_SIZE_BYTES,
int64_t max_power_of_two_extend_bytes = DEFAULT_MAX_POWER_OF_TWO_EXTEND_BYTES);

~BFCArena() override;

Expand Down Expand Up @@ -505,6 +507,7 @@ class BFCArena : public IAllocator {
const int initial_chunk_size_bytes_;
const int max_dead_bytes_per_chunk_;
const int initial_growth_chunk_size_bytes_;
const int64_t max_power_of_two_extend_bytes_;

// This flag is only relevant if Shrink() is invoked.
// This is a boolean flag that controls whether the first allocation region
Expand All @@ -522,7 +525,8 @@ class StreamAwareArena : public BFCArena {
ArenaExtendStrategy arena_extend_strategy = DEFAULT_ARENA_EXTEND_STRATEGY,
int initial_chunk_size_bytes = DEFAULT_INITIAL_CHUNK_SIZE_BYTES,
int max_dead_bytes_per_chunk = DEFAULT_MAX_DEAD_BYTES_PER_CHUNK,
int initial_growth_chunk_size_bytes = DEFAULT_INITIAL_GROWTH_CHUNK_SIZE_BYTES);
int initial_growth_chunk_size_bytes = DEFAULT_INITIAL_GROWTH_CHUNK_SIZE_BYTES,
int64_t max_power_of_two_extend_bytes = DEFAULT_MAX_POWER_OF_TWO_EXTEND_BYTES);

// If size is 0, then this function returns either NULL,
// or a unique pointer value that can later be successfully
Expand Down
2 changes: 1 addition & 1 deletion onnxruntime/core/providers/cuda/cuda_execution_provider.cc
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ AllocatorPtr CUDAExecutionProvider::CreateCudaAllocator(OrtDevice::DeviceId devi
device_id,
true,
{default_memory_arena_cfg ? *default_memory_arena_cfg
: OrtArenaCfg(gpu_mem_limit, static_cast<int>(arena_extend_strategy), -1, -1, -1)},
: OrtArenaCfg(gpu_mem_limit, static_cast<int>(arena_extend_strategy), -1, -1, -1, -1L)},
// make it stream aware
true,
// enable cross stream sharing?
Expand Down
4 changes: 3 additions & 1 deletion onnxruntime/core/session/environment.cc
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ Status Environment::CreateAndRegisterAllocator(const OrtMemoryInfo& mem_info, co
int initial_chunk_size_bytes = -1;
int max_dead_bytes_per_chunk = -1;
int initial_growth_chunk_size_bytes = -1;
int64_t max_power_of_two_extend_bytes = -1L;

// override with values from the user supplied arena_cfg object
if (arena_cfg) {
Expand All @@ -151,10 +152,11 @@ Status Environment::CreateAndRegisterAllocator(const OrtMemoryInfo& mem_info, co
initial_chunk_size_bytes = arena_cfg->initial_chunk_size_bytes;
max_dead_bytes_per_chunk = arena_cfg->max_dead_bytes_per_chunk;
initial_growth_chunk_size_bytes = arena_cfg->initial_growth_chunk_size_bytes;
max_power_of_two_extend_bytes = arena_cfg->max_power_of_two_extend_bytes;
}

OrtArenaCfg l_arena_cfg{max_mem, arena_extend_strategy, initial_chunk_size_bytes, max_dead_bytes_per_chunk,
initial_growth_chunk_size_bytes};
initial_growth_chunk_size_bytes, max_power_of_two_extend_bytes};
AllocatorCreationInfo alloc_creation_info{
[mem_info](int) { return std::make_unique<CPUAllocator>(mem_info); },
0,
Expand Down
3 changes: 3 additions & 0 deletions onnxruntime/core/session/onnxruntime_c_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2233,6 +2233,7 @@ ORT_API_STATUS_IMPL(OrtApis::CreateArenaCfg, _In_ size_t max_mem, int arena_exte
cfg->arena_extend_strategy = arena_extend_strategy;
cfg->initial_chunk_size_bytes = initial_chunk_size_bytes;
cfg->max_dead_bytes_per_chunk = max_dead_bytes_per_chunk;
cfg->max_dead_bytes_per_chunk = -1L;
*out = cfg.release();
return nullptr;
API_IMPL_END
Expand All @@ -2254,6 +2255,8 @@ ORT_API_STATUS_IMPL(OrtApis::CreateArenaCfgV2, _In_reads_(num_keys) const char*
cfg->max_dead_bytes_per_chunk = static_cast<int>(arena_config_values[i]);
} else if (strcmp(arena_config_keys[i], "initial_growth_chunk_size_bytes") == 0) {
cfg->initial_growth_chunk_size_bytes = static_cast<int>(arena_config_values[i]);
} else if (strcmp(arena_config_keys[i], "max_power_of_two_extend_bytes") == 0) {
cfg->max_power_of_two_extend_bytes = static_cast<int64_t>(arena_config_values[i]);
} else {
std::ostringstream oss;
oss << "Invalid key found: " << arena_config_keys[i];
Expand Down
9 changes: 6 additions & 3 deletions onnxruntime/python/onnxruntime_pybind_state.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1148,8 +1148,8 @@ void addObjectMethods(py::module& m, ExecutionProviderRegistrationFn ep_registra
.def_static("default_memory", []() { return OrtDevice::MemType::DEFAULT; });

py::class_<OrtArenaCfg> ort_arena_cfg_binding(m, "OrtArenaCfg");
// Note: Doesn't expose initial_growth_chunk_sizes_bytes option. This constructor kept for
// backwards compatibility, key-value pair constructor overload exposes all options
// Note: Doesn't expose the initial_growth_chunk_size_bytes/max_power_of_two_extend_bytes options.
// This constructor is kept for backwards compatibility; the key-value pair constructor overload exposes all options
// There is a global var: arena_extend_strategy, which means we can't use that var name here
// See docs/C_API.md for details on what the following parameters mean and how to choose these values
ort_arena_cfg_binding.def(py::init([](size_t max_mem, int arena_extend_strategy_local,
Expand All @@ -1175,6 +1175,8 @@ void addObjectMethods(py::module& m, ExecutionProviderRegistrationFn ep_registra
ort_arena_cfg->max_dead_bytes_per_chunk = kvp.second.cast<int>();
} else if (key == "initial_growth_chunk_size_bytes") {
ort_arena_cfg->initial_growth_chunk_size_bytes = kvp.second.cast<int>();
} else if (key == "max_power_of_two_extend_bytes") {
ort_arena_cfg->max_power_of_two_extend_bytes = kvp.second.cast<int>();
} else {
ORT_THROW("Invalid OrtArenaCfg option: ", key);
}
Expand All @@ -1185,7 +1187,8 @@ void addObjectMethods(py::module& m, ExecutionProviderRegistrationFn ep_registra
.def_readwrite("arena_extend_strategy", &OrtArenaCfg::arena_extend_strategy)
.def_readwrite("initial_chunk_size_bytes", &OrtArenaCfg::initial_chunk_size_bytes)
.def_readwrite("max_dead_bytes_per_chunk", &OrtArenaCfg::max_dead_bytes_per_chunk)
.def_readwrite("initial_growth_chunk_size_bytes", &OrtArenaCfg::initial_growth_chunk_size_bytes);
.def_readwrite("initial_growth_chunk_size_bytes", &OrtArenaCfg::initial_growth_chunk_size_bytes)
.def_readwrite("max_power_of_two_extend_bytes", &OrtArenaCfg::max_power_of_two_extend_bytes);

py::class_<OrtMemoryInfo> ort_memory_info_binding(m, "OrtMemoryInfo");
ort_memory_info_binding.def(py::init([](const char* name, OrtAllocatorType type, int id, OrtMemType mem_type) {
Expand Down
Loading