diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index e4d1a85b05ae4..274b0fdf5ca36 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -805,7 +805,7 @@ function(onnxruntime_set_compile_flags target_name) set_target_properties(${target_name} PROPERTIES COMPILE_WARNING_AS_ERROR ON) if (onnxruntime_USE_CUDA) # Suppress a "conversion_function_not_usable" warning in gsl/span - target_compile_options(${target_name} PRIVATE "$<$:SHELL:--diag-suppress 554>") + target_compile_options(${target_name} PRIVATE "$<$:SHELL:-Xcudafe \"--diag_suppress=conversion_function_not_usable\">") endif() if (MSVC) foreach(CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORY ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) @@ -852,7 +852,7 @@ function(onnxruntime_set_compile_flags target_name) target_compile_options(${target_name} PRIVATE "$<$:SHELL:--compiler-options /analyze:external->" "$<$:/analyze:external->") target_compile_options(${target_name} PRIVATE "$<$:SHELL:--compiler-options /wd6385>" ) # There are many such warnings from STL: - # include\list(148): warning C6011: Dereferencing NULL pointer '_Mycont'. : Lines: 146, 147, 148 + # include\list(148): warning C6011: Dereferencing NULL pointer '_Mycont'. 
: Lines: 146, 147, 148 target_compile_options(${target_name} PRIVATE "$<$:SHELL:--compiler-options /wd6011>" ) endif() else() @@ -920,7 +920,7 @@ endfunction() function(onnxruntime_configure_target target_name) target_link_directories(${target_name} PRIVATE ${onnxruntime_LINK_DIRS}) onnxruntime_set_compile_flags(${target_name}) - onnxruntime_set_source_file_properties(${target_name}) + onnxruntime_set_source_file_properties(${target_name}) if(WIN32 AND onnxruntime_ENABLE_STATIC_ANALYSIS AND onnxruntime_USE_CUSTOM_STATIC_ANALYSIS_RULES) set_target_properties(${target_name} PROPERTIES VS_USER_PROPS ${PROJECT_SOURCE_DIR}/EnableVisualStudioCodeAnalysis.props) endif() diff --git a/include/onnxruntime/core/framework/allocator.h b/include/onnxruntime/core/framework/allocator.h index de3b35f305921..938c286879b71 100644 --- a/include/onnxruntime/core/framework/allocator.h +++ b/include/onnxruntime/core/framework/allocator.h @@ -16,20 +16,24 @@ struct OrtArenaCfg { arena_extend_strategy(-1), initial_chunk_size_bytes(-1), max_dead_bytes_per_chunk(-1), - initial_growth_chunk_size_bytes(-1) {} + initial_growth_chunk_size_bytes(-1), + max_power_of_two_extend_bytes(-1) {} OrtArenaCfg(size_t max_mem, int arena_extend_strategy, int initial_chunk_size_bytes, - int max_dead_bytes_per_chunk, int initial_growth_chunk_size_bytes) + int max_dead_bytes_per_chunk, int initial_growth_chunk_size_bytes, + int64_t max_power_of_two_extend_bytes) : max_mem(max_mem), arena_extend_strategy(arena_extend_strategy), initial_chunk_size_bytes(initial_chunk_size_bytes), max_dead_bytes_per_chunk(max_dead_bytes_per_chunk), - initial_growth_chunk_size_bytes(initial_growth_chunk_size_bytes) {} - - size_t max_mem; // use 0 to allow ORT to choose the default - int arena_extend_strategy; // use -1 to allow ORT to choose the default, 0 = kNextPowerOfTwo, 1 = kSameAsRequested - int initial_chunk_size_bytes; // use -1 to allow ORT to choose the default - int max_dead_bytes_per_chunk; // use -1 to allow ORT to 
choose the default - int initial_growth_chunk_size_bytes; // use -1 to allow ORT to choose the default + initial_growth_chunk_size_bytes(initial_growth_chunk_size_bytes), + max_power_of_two_extend_bytes(max_power_of_two_extend_bytes) {} + + size_t max_mem; // use 0 to allow ORT to choose the default + int arena_extend_strategy; // use -1 to allow ORT to choose the default, 0 = kNextPowerOfTwo, 1 = kSameAsRequested + int initial_chunk_size_bytes; // use -1 to allow ORT to choose the default + int max_dead_bytes_per_chunk; // use -1 to allow ORT to choose the default + int initial_growth_chunk_size_bytes; // use -1 to allow ORT to choose the default + int64_t max_power_of_two_extend_bytes; // use -1 to allow ORT to choose the default }; namespace onnxruntime { diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index 854e65a6dccc7..10c63e67fe2cc 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -2728,6 +2728,10 @@ struct OrtApi { * crossing which the current chunk is chunked into 2. * "initial_growth_chunk_size_bytes": (Possible) Size of the second allocation in the arena. * Only relevant if arena strategy is `kNextPowerOfTwo`. Use -1 to allow ORT to choose the default. + * "max_power_of_two_extend_bytes": The maximum extend size if arena strategy is `kNextPowerOfTwo`. + * It is not an allocation limit; it only limits the extension when the requested bytes are less than the limit. + * When the requested bytes are more than the limit, the allocator will still return as requested. + * Use -1 to allow ORT to choose the default 1GB for max_power_of_two_extend_bytes. * Ultimately, the allocation size is determined by the allocation memory request. * Further allocation sizes are governed by the arena extend strategy.
* diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h index 2d5e1a9bddeec..949b2a91d27d9 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h @@ -1526,6 +1526,7 @@ struct ArenaCfg : detail::Base { * \param arena_extend_strategy - use -1 to allow ORT to choose the default, 0 = kNextPowerOfTwo, 1 = kSameAsRequested * \param initial_chunk_size_bytes - use -1 to allow ORT to choose the default * \param max_dead_bytes_per_chunk - use -1 to allow ORT to choose the default + * \param max_power_of_two_extend_bytes - use -1 to allow ORT to choose the default * See docs/C_API.md for details on what the following parameters mean and how to choose these values */ ArenaCfg(size_t max_mem, int arena_extend_strategy, int initial_chunk_size_bytes, int max_dead_bytes_per_chunk); diff --git a/onnxruntime/contrib_ops/cuda/bert/attention.cc b/onnxruntime/contrib_ops/cuda/bert/attention.cc index bd1498c0f9de7..ada7bc5fa2597 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/attention.cc @@ -164,7 +164,7 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { has_memory_efficient_attention(sm, sizeof(T) == 2); #else constexpr bool use_memory_efficient_attention = false; - ORT_UNUSED_VARIABLE(is_mask_1d_key_seq_len_start); + ORT_UNUSED_PARAMETER(is_mask_1d_key_seq_len_start); #endif cublasHandle_t cublas = GetCublasHandle(context); diff --git a/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc b/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc index 9d68ad7d7d7fa..490c7275c7412 100644 --- a/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc @@ -182,6 +182,7 @@ Status MultiHeadAttention::ComputeInternal(OpKernelContext* context) const { has_memory_efficient_attention(sm, sizeof(T) == 2); #else 
constexpr bool use_memory_efficient_attention = false; + ORT_UNUSED_PARAMETER(is_mask_1d_key_seq_len_start); #endif // When packed kv or packed qkv is used, there is no needed for add bias transpose thus no qkv workspace. diff --git a/onnxruntime/core/framework/allocatormgr.cc b/onnxruntime/core/framework/allocatormgr.cc index 192206fbd2206..44e0e9c4e8ad2 100644 --- a/onnxruntime/core/framework/allocatormgr.cc +++ b/onnxruntime/core/framework/allocatormgr.cc @@ -42,6 +42,9 @@ AllocatorPtr CreateAllocator(const AllocatorCreationInfo& info) { int initial_growth_chunk_size_bytes = info.arena_cfg.initial_growth_chunk_size_bytes == -1 ? BFCArena::DEFAULT_INITIAL_GROWTH_CHUNK_SIZE_BYTES : info.arena_cfg.initial_growth_chunk_size_bytes; + int64_t max_power_of_two_extend_bytes = info.arena_cfg.max_power_of_two_extend_bytes == -1 + ? BFCArena::DEFAULT_MAX_POWER_OF_TWO_EXTEND_BYTES + : info.arena_cfg.max_power_of_two_extend_bytes; ArenaExtendStrategy arena_extend_str; switch (info.arena_cfg.arena_extend_strategy) { case static_cast(ArenaExtendStrategy::kSameAsRequested): @@ -77,7 +80,8 @@ AllocatorPtr CreateAllocator(const AllocatorCreationInfo& info) { arena_extend_str, initial_chunk_size_bytes, max_dead_bytes_per_chunk, - initial_growth_chunk_size_bytes)); + initial_growth_chunk_size_bytes, + max_power_of_two_extend_bytes)); } } else { return device_allocator; diff --git a/onnxruntime/core/framework/allocatormgr.h b/onnxruntime/core/framework/allocatormgr.h index 7a92b80c674d4..56ea4e443ff40 100644 --- a/onnxruntime/core/framework/allocatormgr.h +++ b/onnxruntime/core/framework/allocatormgr.h @@ -18,7 +18,7 @@ struct AllocatorCreationInfo { AllocatorCreationInfo(AllocatorFactory device_alloc_factory, OrtDevice::DeviceId device_id = 0, bool use_arena = true, - OrtArenaCfg arena_cfg = {0, -1, -1, -1, -1}, + OrtArenaCfg arena_cfg = {0, -1, -1, -1, -1, -1L}, bool stream_aware_arena = false, bool cross_stream_reusing = false) : device_alloc_factory(device_alloc_factory), diff 
--git a/onnxruntime/core/framework/bfc_arena.cc b/onnxruntime/core/framework/bfc_arena.cc index 097e8e9eadad2..13f9656ae0595 100644 --- a/onnxruntime/core/framework/bfc_arena.cc +++ b/onnxruntime/core/framework/bfc_arena.cc @@ -11,7 +11,8 @@ BFCArena::BFCArena(std::unique_ptr resource_allocator, ArenaExtendStrategy arena_extend_strategy, int initial_chunk_size_bytes, int max_dead_bytes_per_chunk, - int initial_growth_chunk_size_bytes) + int initial_growth_chunk_size_bytes, + int64_t max_power_of_two_extend_bytes) : IAllocator(OrtMemoryInfo(resource_allocator->Info().name, OrtAllocatorType::OrtArenaAllocator, resource_allocator->Info().device, @@ -23,11 +24,13 @@ BFCArena::BFCArena(std::unique_ptr resource_allocator, next_allocation_id_(1), initial_chunk_size_bytes_(initial_chunk_size_bytes), max_dead_bytes_per_chunk_(max_dead_bytes_per_chunk), - initial_growth_chunk_size_bytes_(initial_growth_chunk_size_bytes) { + initial_growth_chunk_size_bytes_(initial_growth_chunk_size_bytes), + max_power_of_two_extend_bytes_(max_power_of_two_extend_bytes) { LOGS_DEFAULT(INFO) << "Creating BFCArena for " << device_allocator_->Info().name << " with following configs: initial_chunk_size_bytes: " << initial_chunk_size_bytes_ << " max_dead_bytes_per_chunk: " << max_dead_bytes_per_chunk_ << " initial_growth_chunk_size_bytes: " << initial_growth_chunk_size_bytes_ + << " max_power_of_two_extend_bytes: " << max_power_of_two_extend_bytes_ << " memory limit: " << total_memory << " arena_extend_strategy: " << static_cast(arena_extend_strategy); @@ -144,7 +147,12 @@ Status BFCArena::Extend(size_t rounded_bytes) { // we allocated the same number of bytes as the current region // the 2x is to double the minimum size of the next amount we'll allocate if (!increased_allocation) { - curr_region_allocation_bytes_ *= 2; + if (arena_extend_strategy_ == ArenaExtendStrategy::kNextPowerOfTwo && + curr_region_allocation_bytes_ * 2 < max_power_of_two_extend_bytes_) { + curr_region_allocation_bytes_ *= 
2; + } else { + curr_region_allocation_bytes_ = max_power_of_two_extend_bytes_; + } } } else if (arena_extend_strategy_ == ArenaExtendStrategy::kSameAsRequested) { // BFC Arena could cause internal and external fragmentation. But, running training with @@ -847,13 +855,15 @@ StreamAwareArena::StreamAwareArena(std::unique_ptr resource_allocato ArenaExtendStrategy arena_extend_strategy, int initial_chunk_size_bytes, int max_dead_bytes_per_chunk, - int initial_growth_chunk_size_bytes) : BFCArena(std::move(resource_allocator), - total_memory, - arena_extend_strategy, - initial_chunk_size_bytes, - max_dead_bytes_per_chunk, - initial_growth_chunk_size_bytes), - enable_cross_stream_reusing_(enable_cross_stream_sharing) { + int initial_growth_chunk_size_bytes, + int64_t max_power_of_two_extend_bytes) : BFCArena(std::move(resource_allocator), + total_memory, + arena_extend_strategy, + initial_chunk_size_bytes, + max_dead_bytes_per_chunk, + initial_growth_chunk_size_bytes, + max_power_of_two_extend_bytes), + enable_cross_stream_reusing_(enable_cross_stream_sharing) { arena_type_ = ArenaType::StreamAwareArena; } diff --git a/onnxruntime/core/framework/bfc_arena.h b/onnxruntime/core/framework/bfc_arena.h index 311a9ae7869f0..e16b90ded3381 100644 --- a/onnxruntime/core/framework/bfc_arena.h +++ b/onnxruntime/core/framework/bfc_arena.h @@ -59,6 +59,7 @@ class BFCArena : public IAllocator { static const int DEFAULT_INITIAL_CHUNK_SIZE_BYTES = 1 * 1024 * 1024; static const int DEFAULT_MAX_DEAD_BYTES_PER_CHUNK = 128 * 1024 * 1024; static const int DEFAULT_INITIAL_GROWTH_CHUNK_SIZE_BYTES = 2 * 1024 * 1024; + static const int64_t DEFAULT_MAX_POWER_OF_TWO_EXTEND_BYTES = 1024 * 1024 * 1024; // 1GB static const size_t DEFAULT_MAX_MEM = std::numeric_limits::max(); enum ArenaType { @@ -71,7 +72,8 @@ class BFCArena : public IAllocator { ArenaExtendStrategy arena_extend_strategy = DEFAULT_ARENA_EXTEND_STRATEGY, int initial_chunk_size_bytes = DEFAULT_INITIAL_CHUNK_SIZE_BYTES, int 
max_dead_bytes_per_chunk = DEFAULT_MAX_DEAD_BYTES_PER_CHUNK, - int initial_growth_chunk_size_bytes = DEFAULT_INITIAL_GROWTH_CHUNK_SIZE_BYTES); + int initial_growth_chunk_size_bytes = DEFAULT_INITIAL_GROWTH_CHUNK_SIZE_BYTES, + int64_t max_power_of_two_extend_bytes = DEFAULT_MAX_POWER_OF_TWO_EXTEND_BYTES); ~BFCArena() override; @@ -505,6 +507,7 @@ class BFCArena : public IAllocator { const int initial_chunk_size_bytes_; const int max_dead_bytes_per_chunk_; const int initial_growth_chunk_size_bytes_; + const int64_t max_power_of_two_extend_bytes_; // This flag is only relevant if Shrink() is invoked. // This is a boolean flag that controls whether the first allocation region @@ -522,7 +525,8 @@ class StreamAwareArena : public BFCArena { ArenaExtendStrategy arena_extend_strategy = DEFAULT_ARENA_EXTEND_STRATEGY, int initial_chunk_size_bytes = DEFAULT_INITIAL_CHUNK_SIZE_BYTES, int max_dead_bytes_per_chunk = DEFAULT_MAX_DEAD_BYTES_PER_CHUNK, - int initial_growth_chunk_size_bytes = DEFAULT_INITIAL_GROWTH_CHUNK_SIZE_BYTES); + int initial_growth_chunk_size_bytes = DEFAULT_INITIAL_GROWTH_CHUNK_SIZE_BYTES, + int64_t max_power_of_two_extend_bytes = DEFAULT_MAX_POWER_OF_TWO_EXTEND_BYTES); // If size is 0, then this function returns either NULL, // or a unique pointer value that can later be successfully diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index c0acd365c1374..74424371e6abc 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -149,7 +149,7 @@ AllocatorPtr CUDAExecutionProvider::CreateCudaAllocator(OrtDevice::DeviceId devi device_id, true, {default_memory_arena_cfg ? 
*default_memory_arena_cfg - : OrtArenaCfg(gpu_mem_limit, static_cast(arena_extend_strategy), -1, -1, -1)}, + : OrtArenaCfg(gpu_mem_limit, static_cast(arena_extend_strategy), -1, -1, -1, -1L)}, // make it stream aware true, // enable cross stream sharing? diff --git a/onnxruntime/core/session/environment.cc b/onnxruntime/core/session/environment.cc index 3a20dc23116a6..dc225b131db7e 100644 --- a/onnxruntime/core/session/environment.cc +++ b/onnxruntime/core/session/environment.cc @@ -135,6 +135,7 @@ Status Environment::CreateAndRegisterAllocator(const OrtMemoryInfo& mem_info, co int initial_chunk_size_bytes = -1; int max_dead_bytes_per_chunk = -1; int initial_growth_chunk_size_bytes = -1; + int64_t max_power_of_two_extend_bytes = -1L; // override with values from the user supplied arena_cfg object if (arena_cfg) { @@ -151,10 +152,11 @@ Status Environment::CreateAndRegisterAllocator(const OrtMemoryInfo& mem_info, co initial_chunk_size_bytes = arena_cfg->initial_chunk_size_bytes; max_dead_bytes_per_chunk = arena_cfg->max_dead_bytes_per_chunk; initial_growth_chunk_size_bytes = arena_cfg->initial_growth_chunk_size_bytes; + max_power_of_two_extend_bytes = arena_cfg->max_power_of_two_extend_bytes; } OrtArenaCfg l_arena_cfg{max_mem, arena_extend_strategy, initial_chunk_size_bytes, max_dead_bytes_per_chunk, - initial_growth_chunk_size_bytes}; + initial_growth_chunk_size_bytes, max_power_of_two_extend_bytes}; AllocatorCreationInfo alloc_creation_info{ [mem_info](int) { return std::make_unique(mem_info); }, 0, diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index 708cb825a661e..9c909068056a5 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -2233,6 +2233,7 @@ ORT_API_STATUS_IMPL(OrtApis::CreateArenaCfg, _In_ size_t max_mem, int arena_exte cfg->arena_extend_strategy = arena_extend_strategy; cfg->initial_chunk_size_bytes = initial_chunk_size_bytes; 
cfg->max_dead_bytes_per_chunk = max_dead_bytes_per_chunk; + cfg->max_power_of_two_extend_bytes = -1L; *out = cfg.release(); return nullptr; API_IMPL_END @@ -2254,6 +2255,8 @@ ORT_API_STATUS_IMPL(OrtApis::CreateArenaCfgV2, _In_reads_(num_keys) const char* cfg->max_dead_bytes_per_chunk = static_cast(arena_config_values[i]); } else if (strcmp(arena_config_keys[i], "initial_growth_chunk_size_bytes") == 0) { cfg->initial_growth_chunk_size_bytes = static_cast(arena_config_values[i]); + } else if (strcmp(arena_config_keys[i], "max_power_of_two_extend_bytes") == 0) { + cfg->max_power_of_two_extend_bytes = static_cast(arena_config_values[i]); } else { std::ostringstream oss; oss << "Invalid key found: " << arena_config_keys[i]; diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index cf709e508dbf3..fe3bb46c92cf6 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -1148,8 +1148,8 @@ void addObjectMethods(py::module& m, ExecutionProviderRegistrationFn ep_registra .def_static("default_memory", []() { return OrtDevice::MemType::DEFAULT; }); py::class_ ort_arena_cfg_binding(m, "OrtArenaCfg"); - // Note: Doesn't expose initial_growth_chunk_sizes_bytes option. This constructor kept for - // backwards compatibility, key-value pair constructor overload exposes all options + // Note: Doesn't expose initial_growth_chunk_sizes_bytes/max_power_of_two_extend_bytes option. 
+ // This constructor kept for backwards compatibility, key-value pair constructor overload exposes all options // There is a global var: arena_extend_strategy, which means we can't use that var name here // See docs/C_API.md for details on what the following parameters mean and how to choose these values ort_arena_cfg_binding.def(py::init([](size_t max_mem, int arena_extend_strategy_local, @@ -1175,6 +1175,8 @@ void addObjectMethods(py::module& m, ExecutionProviderRegistrationFn ep_registra ort_arena_cfg->max_dead_bytes_per_chunk = kvp.second.cast(); } else if (key == "initial_growth_chunk_size_bytes") { ort_arena_cfg->initial_growth_chunk_size_bytes = kvp.second.cast(); + } else if (key == "max_power_of_two_extend_bytes") { + ort_arena_cfg->max_power_of_two_extend_bytes = kvp.second.cast(); } else { ORT_THROW("Invalid OrtArenaCfg option: ", key); } @@ -1185,7 +1187,8 @@ void addObjectMethods(py::module& m, ExecutionProviderRegistrationFn ep_registra .def_readwrite("arena_extend_strategy", &OrtArenaCfg::arena_extend_strategy) .def_readwrite("initial_chunk_size_bytes", &OrtArenaCfg::initial_chunk_size_bytes) .def_readwrite("max_dead_bytes_per_chunk", &OrtArenaCfg::max_dead_bytes_per_chunk) - .def_readwrite("initial_growth_chunk_size_bytes", &OrtArenaCfg::initial_growth_chunk_size_bytes); + .def_readwrite("initial_growth_chunk_size_bytes", &OrtArenaCfg::initial_growth_chunk_size_bytes) + .def_readwrite("max_power_of_two_extend_bytes", &OrtArenaCfg::max_power_of_two_extend_bytes); py::class_ ort_memory_info_binding(m, "OrtMemoryInfo"); ort_memory_info_binding.def(py::init([](const char* name, OrtAllocatorType type, int id, OrtMemType mem_type) { diff --git a/onnxruntime/test/framework/bfc_arena_test.cc b/onnxruntime/test/framework/bfc_arena_test.cc index 584a0a32945b3..a6034f2ea0b79 100644 --- a/onnxruntime/test/framework/bfc_arena_test.cc +++ b/onnxruntime/test/framework/bfc_arena_test.cc @@ -2,6 +2,7 @@ // Licensed under the MIT License. 
#include "core/framework/bfc_arena.h" +#include "core/framework/allocatormgr.h" #include "gtest/gtest.h" #include "gmock/gmock.h" #include @@ -409,5 +410,70 @@ TEST(StreamAwareArenaTest, TestSecureTheChunk) { a.Free(p2); } +TEST(BFCArenaTest, TestExtendStrategy) { + int64_t extend_delta_bytes = 0; + { + // Use kNextPowerOfTwo strategy with default extension limit: 1GB. + BFCArena a(std::unique_ptr(new CPUAllocator()), 1UL << 30, ArenaExtendStrategy::kNextPowerOfTwo); + size_t block_size = 1 << 20; // 1MB + a.Alloc(block_size); + AllocatorStats stats; + a.GetStats(&stats); + int64_t prev_allocated_bytes = stats.total_allocated_bytes; + extend_delta_bytes = stats.total_allocated_bytes; + ASSERT_EQ(extend_delta_bytes, static_cast(block_size)); + for (int i = 1; i < 256; ++i) { + a.Alloc(block_size); + a.GetStats(&stats); + if (stats.total_allocated_bytes != prev_allocated_bytes) { + int64_t new_delta_bytes = stats.total_allocated_bytes - prev_allocated_bytes; + ASSERT_EQ(new_delta_bytes, 2 * extend_delta_bytes); + extend_delta_bytes = new_delta_bytes; + prev_allocated_bytes = stats.total_allocated_bytes; + } + } + } + int64_t extend_limit = 1 << 25; // 32MB + ASSERT_GT(extend_delta_bytes, extend_limit); + extend_delta_bytes = 0; + { + // Use kNextPowerOfTwo strategy with much smaller extension limit: 32MB. 
+ OrtArenaCfg config(0, 0, -1, -1, -1, extend_limit); + AllocatorCreationInfo device_info{ + [](OrtDevice::DeviceId) { return std::make_unique(); }, + 0, true, config}; + auto allocator = CreateAllocator(device_info); + size_t block_size = 1 << 20; // 1MB + BFCArena& a = *static_cast(allocator.get()); + a.Alloc(block_size); + AllocatorStats stats; + a.GetStats(&stats); + int64_t prev_allocated_bytes = stats.total_allocated_bytes; + extend_delta_bytes = stats.total_allocated_bytes; + ASSERT_EQ(extend_delta_bytes, static_cast(block_size)); + int reach_limit_count = 0; + for (int i = 1; i < 256; ++i) { + a.Alloc(block_size); + a.GetStats(&stats); + if (stats.total_allocated_bytes != prev_allocated_bytes) { + int64_t new_delta_bytes = stats.total_allocated_bytes - prev_allocated_bytes; + if (new_delta_bytes < extend_limit) { + ASSERT_EQ(new_delta_bytes, 2 * extend_delta_bytes) << "index:" << i; + } else { + // The increasing of new chunk reaches the limit. + ++reach_limit_count; + ASSERT_EQ(new_delta_bytes, extend_limit); + } + extend_delta_bytes = new_delta_bytes; + prev_allocated_bytes = stats.total_allocated_bytes; + } + } + ASSERT_GT(reach_limit_count, 2); + // It is OK to allocate more than extend_limit. 
+ ASSERT_NE(a.Alloc(block_size * 64), nullptr); + } + ASSERT_EQ(extend_delta_bytes, extend_limit); +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py index 416cdd5ddaff3..b49547a040907 100644 --- a/onnxruntime/test/python/onnxruntime_test_python.py +++ b/onnxruntime/test/python/onnxruntime_test_python.py @@ -1314,7 +1314,7 @@ def testSharedAllocatorUsingCreateAndRegisterAllocator(self): # noqa: N802 # Create and register an arena based allocator # To create an OrtArenaCfg using non-default parameters, use one of below templates: - # ort_arena_cfg = onnxrt.OrtArenaCfg(0, -1, -1, -1) - Note: doesn't expose initial_growth_chunk_size_bytes option + # ort_arena_cfg = onnxrt.OrtArenaCfg(0, -1, -1, -1) - Note: doesn't expose initial_growth_chunk_size_bytes/max_power_of_two_extend_bytes option # ort_arena_cfg = onnxrt.OrtArenaCfg({"max_mem": -1, ""arena_extend_strategy": 1, etc..}) ort_memory_info = onnxrt.OrtMemoryInfo( "Cpu", @@ -1482,6 +1482,8 @@ def verify_allocator(allocator, expected_config): self.assertEqual(allocator.max_dead_bytes_per_chunk, val) elif key == "initial_growth_chunk_size_bytes": self.assertEqual(allocator.initial_growth_chunk_size_bytes, val) + elif key == "max_power_of_two_extend_bytes": + self.assertEqual(allocator.max_power_of_two_extend_bytes, val) else: raise ValueError("Invalid OrtArenaCfg option: " + key) @@ -1506,6 +1508,18 @@ def verify_allocator(allocator, expected_config): ort_arena_cfg_kvp = onnxrt.OrtArenaCfg(expected_kvp_allocator) verify_allocator(ort_arena_cfg_kvp, expected_kvp_allocator) + # Verify key-value pair initialization + expected_kvp_allocator = { + "max_mem": 32, + "arena_extend_strategy": 11, + "initial_chunk_size_bytes": 18, + "max_dead_bytes_per_chunk": 14, + "initial_growth_chunk_size_bytes": 12, + "max_power_of_two_extend_bytes": 17, + } + ort_arena_cfg_kvp = onnxrt.OrtArenaCfg(expected_kvp_allocator) + 
verify_allocator(ort_arena_cfg_kvp, expected_kvp_allocator) + if __name__ == "__main__": unittest.main(verbosity=1) diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc index f53a93225b524..8049d7baddb6c 100644 --- a/onnxruntime/test/shared_lib/test_inference.cc +++ b/onnxruntime/test/shared_lib/test_inference.cc @@ -2499,8 +2499,8 @@ TEST(CApiTest, ConfigureCudaArenaAndDemonstrateMemoryArenaShrinkage) { Ort::SessionOptions session_options; - const char* keys[] = {"max_mem", "arena_extend_strategy", "initial_chunk_size_bytes", "max_dead_bytes_per_chunk", "initial_growth_chunk_size_bytes"}; - const size_t values[] = {0 /*let ort pick default max memory*/, 0, 1024, 0, 256}; + const char* keys[] = {"max_mem", "arena_extend_strategy", "initial_chunk_size_bytes", "max_dead_bytes_per_chunk", "initial_growth_chunk_size_bytes", "max_power_of_two_extend_bytes"}; + const size_t values[] = {0 /*let ort pick default max memory*/, 0, 1024, 0, 256, 1L << 24}; OrtArenaCfg* arena_cfg = nullptr; - ASSERT_TRUE(api.CreateArenaCfgV2(keys, values, 5, &arena_cfg) == nullptr); + ASSERT_TRUE(api.CreateArenaCfgV2(keys, values, 6, &arena_cfg) == nullptr); diff --git a/onnxruntime/test/testdata/custom_execution_provider_library/my_execution_provider.cc b/onnxruntime/test/testdata/custom_execution_provider_library/my_execution_provider.cc index cf3c2ae39fa46..b0fb0347c7977 100644 --- a/onnxruntime/test/testdata/custom_execution_provider_library/my_execution_provider.cc +++ b/onnxruntime/test/testdata/custom_execution_provider_library/my_execution_provider.cc @@ -19,7 +19,7 @@ MyExecutionProvider::MyExecutionProvider(const MyProviderInfo& info) [](OrtDevice::DeviceId device_id) { return std::make_unique(device_id); }, device_id_, true, - {0, 1, -1, -1, -1}}; + {0, 1, -1, -1, -1, -1L}}; InsertAllocator(CreateAllocator(device_info)); }