diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index e4d1a85b05ae4..274b0fdf5ca36 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -805,7 +805,7 @@ function(onnxruntime_set_compile_flags target_name)
     set_target_properties(${target_name} PROPERTIES COMPILE_WARNING_AS_ERROR ON)
     if (onnxruntime_USE_CUDA)
       # Suppress a "conversion_function_not_usable" warning in gsl/span
-      target_compile_options(${target_name} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--diag-suppress 554>")
+      target_compile_options(${target_name} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcudafe \"--diag_suppress=conversion_function_not_usable\">")
     endif()
     if (MSVC)
       foreach(CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORY ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
@@ -852,7 +852,7 @@ function(onnxruntime_set_compile_flags target_name)
         target_compile_options(${target_name} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /analyze:external->" "$<$<COMPILE_LANGUAGE:CXX,C>:/analyze:external->")
         target_compile_options(${target_name} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /wd6385>" )
         # There are many such warnings from STL:
-        # include\list(148): warning C6011: Dereferencing NULL pointer '_Mycont'. : Lines: 146, 147, 148 
+        # include\list(148): warning C6011: Dereferencing NULL pointer '_Mycont'. : Lines: 146, 147, 148
         target_compile_options(${target_name} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /wd6011>" )
       endif()
     else()
@@ -920,7 +920,7 @@ endfunction()
 function(onnxruntime_configure_target target_name)
   target_link_directories(${target_name} PRIVATE ${onnxruntime_LINK_DIRS})
   onnxruntime_set_compile_flags(${target_name})
-  onnxruntime_set_source_file_properties(${target_name})  
+  onnxruntime_set_source_file_properties(${target_name})
   if(WIN32 AND onnxruntime_ENABLE_STATIC_ANALYSIS AND onnxruntime_USE_CUSTOM_STATIC_ANALYSIS_RULES)
     set_target_properties(${target_name} PROPERTIES VS_USER_PROPS ${PROJECT_SOURCE_DIR}/EnableVisualStudioCodeAnalysis.props)
   endif()
diff --git a/include/onnxruntime/core/framework/allocator.h b/include/onnxruntime/core/framework/allocator.h
index de3b35f305921..938c286879b71 100644
--- a/include/onnxruntime/core/framework/allocator.h
+++ b/include/onnxruntime/core/framework/allocator.h
@@ -16,20 +16,24 @@ struct OrtArenaCfg {
                   arena_extend_strategy(-1),
                   initial_chunk_size_bytes(-1),
                   max_dead_bytes_per_chunk(-1),
-                  initial_growth_chunk_size_bytes(-1) {}
+                  initial_growth_chunk_size_bytes(-1),
+                  max_power_of_two_extend_bytes(-1) {}
   OrtArenaCfg(size_t max_mem, int arena_extend_strategy, int initial_chunk_size_bytes,
-              int max_dead_bytes_per_chunk, int initial_growth_chunk_size_bytes)
+              int max_dead_bytes_per_chunk, int initial_growth_chunk_size_bytes,
+              int64_t max_power_of_two_extend_bytes)
       : max_mem(max_mem),
         arena_extend_strategy(arena_extend_strategy),
         initial_chunk_size_bytes(initial_chunk_size_bytes),
         max_dead_bytes_per_chunk(max_dead_bytes_per_chunk),
-        initial_growth_chunk_size_bytes(initial_growth_chunk_size_bytes) {}
-
-  size_t max_mem;                       // use 0 to allow ORT to choose the default
-  int arena_extend_strategy;            // use -1 to allow ORT to choose the default, 0 = kNextPowerOfTwo, 1 = kSameAsRequested
-  int initial_chunk_size_bytes;         // use -1 to allow ORT to choose the default
-  int max_dead_bytes_per_chunk;         // use -1 to allow ORT to choose the default
-  int initial_growth_chunk_size_bytes;  // use -1 to allow ORT to choose the default
+        initial_growth_chunk_size_bytes(initial_growth_chunk_size_bytes),
+        max_power_of_two_extend_bytes(max_power_of_two_extend_bytes) {}
+
+  size_t max_mem;                         // use 0 to allow ORT to choose the default
+  int arena_extend_strategy;              // use -1 to allow ORT to choose the default, 0 = kNextPowerOfTwo, 1 = kSameAsRequested
+  int initial_chunk_size_bytes;           // use -1 to allow ORT to choose the default
+  int max_dead_bytes_per_chunk;           // use -1 to allow ORT to choose the default
+  int initial_growth_chunk_size_bytes;    // use -1 to allow ORT to choose the default
+  int64_t max_power_of_two_extend_bytes;  // use -1 to allow ORT to choose the default
 };
 
 namespace onnxruntime {
diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h
index 854e65a6dccc7..10c63e67fe2cc 100644
--- a/include/onnxruntime/core/session/onnxruntime_c_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -2728,6 +2728,10 @@ struct OrtApi {
    *  crossing which the current chunk is chunked into 2.
    * "initial_growth_chunk_size_bytes": (Possible) Size of the second allocation in the arena.
    *  Only relevant if arena strategy is `kNextPowerOfTwo`. Use -1 to allow ORT to choose the default.
+   * "max_power_of_two_extend_bytes": The maximum extend size if the arena strategy is `kNextPowerOfTwo`.
+   *  It is not an allocation limit; it only limits the extension size when the requested bytes are less than the limit.
+   *  When the requested bytes exceed the limit, the allocator will still allocate as requested.
+   *  Use -1 to allow ORT to choose the default 1GB for max_power_of_two_extend_bytes.
    *  Ultimately, the allocation size is determined by the allocation memory request.
    *  Further allocation sizes are governed by the arena extend strategy.
    *
diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h
index 2d5e1a9bddeec..949b2a91d27d9 100644
--- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h
@@ -1526,6 +1526,7 @@ struct ArenaCfg : detail::Base<OrtArenaCfg> {
    * \param arena_extend_strategy -  use -1 to allow ORT to choose the default, 0 = kNextPowerOfTwo, 1 = kSameAsRequested
    * \param initial_chunk_size_bytes - use -1 to allow ORT to choose the default
    * \param max_dead_bytes_per_chunk - use -1 to allow ORT to choose the default
+   * \note max_power_of_two_extend_bytes is not a parameter of this constructor; use OrtApi::CreateArenaCfgV2 to set it
    * See docs/C_API.md for details on what the following parameters mean and how to choose these values
    */
   ArenaCfg(size_t max_mem, int arena_extend_strategy, int initial_chunk_size_bytes, int max_dead_bytes_per_chunk);
diff --git a/onnxruntime/contrib_ops/cuda/bert/attention.cc b/onnxruntime/contrib_ops/cuda/bert/attention.cc
index bd1498c0f9de7..ada7bc5fa2597 100644
--- a/onnxruntime/contrib_ops/cuda/bert/attention.cc
+++ b/onnxruntime/contrib_ops/cuda/bert/attention.cc
@@ -164,7 +164,7 @@ Status Attention<T>::ComputeInternal(OpKernelContext* context) const {
                                         has_memory_efficient_attention(sm, sizeof(T) == 2);
 #else
   constexpr bool use_memory_efficient_attention = false;
-  ORT_UNUSED_VARIABLE(is_mask_1d_key_seq_len_start);
+  ORT_UNUSED_PARAMETER(is_mask_1d_key_seq_len_start);
 #endif
 
   cublasHandle_t cublas = GetCublasHandle(context);
diff --git a/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc b/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc
index 9d68ad7d7d7fa..490c7275c7412 100644
--- a/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc
+++ b/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc
@@ -182,6 +182,7 @@ Status MultiHeadAttention<T>::ComputeInternal(OpKernelContext* context) const {
                                         has_memory_efficient_attention(sm, sizeof(T) == 2);
 #else
   constexpr bool use_memory_efficient_attention = false;
+  ORT_UNUSED_PARAMETER(is_mask_1d_key_seq_len_start);
 #endif
 
   // When packed kv or packed qkv is used, there is no needed for add bias transpose thus no qkv workspace.
diff --git a/onnxruntime/core/framework/allocatormgr.cc b/onnxruntime/core/framework/allocatormgr.cc
index 192206fbd2206..44e0e9c4e8ad2 100644
--- a/onnxruntime/core/framework/allocatormgr.cc
+++ b/onnxruntime/core/framework/allocatormgr.cc
@@ -42,6 +42,9 @@ AllocatorPtr CreateAllocator(const AllocatorCreationInfo& info) {
     int initial_growth_chunk_size_bytes = info.arena_cfg.initial_growth_chunk_size_bytes == -1
                                               ? BFCArena::DEFAULT_INITIAL_GROWTH_CHUNK_SIZE_BYTES
                                               : info.arena_cfg.initial_growth_chunk_size_bytes;
+    int64_t max_power_of_two_extend_bytes = info.arena_cfg.max_power_of_two_extend_bytes == -1
+                                                ? BFCArena::DEFAULT_MAX_POWER_OF_TWO_EXTEND_BYTES
+                                                : info.arena_cfg.max_power_of_two_extend_bytes;
     ArenaExtendStrategy arena_extend_str;
     switch (info.arena_cfg.arena_extend_strategy) {
       case static_cast<int>(ArenaExtendStrategy::kSameAsRequested):
@@ -77,7 +80,8 @@ AllocatorPtr CreateAllocator(const AllocatorCreationInfo& info) {
                                      arena_extend_str,
                                      initial_chunk_size_bytes,
                                      max_dead_bytes_per_chunk,
-                                     initial_growth_chunk_size_bytes));
+                                     initial_growth_chunk_size_bytes,
+                                     max_power_of_two_extend_bytes));
     }
   } else {
     return device_allocator;
diff --git a/onnxruntime/core/framework/allocatormgr.h b/onnxruntime/core/framework/allocatormgr.h
index 7a92b80c674d4..56ea4e443ff40 100644
--- a/onnxruntime/core/framework/allocatormgr.h
+++ b/onnxruntime/core/framework/allocatormgr.h
@@ -18,7 +18,7 @@ struct AllocatorCreationInfo {
   AllocatorCreationInfo(AllocatorFactory device_alloc_factory,
                         OrtDevice::DeviceId device_id = 0,
                         bool use_arena = true,
-                        OrtArenaCfg arena_cfg = {0, -1, -1, -1, -1},
+                        OrtArenaCfg arena_cfg = {0, -1, -1, -1, -1, -1L},
                         bool stream_aware_arena = false,
                         bool cross_stream_reusing = false)
       : device_alloc_factory(device_alloc_factory),
diff --git a/onnxruntime/core/framework/bfc_arena.cc b/onnxruntime/core/framework/bfc_arena.cc
index 097e8e9eadad2..13f9656ae0595 100644
--- a/onnxruntime/core/framework/bfc_arena.cc
+++ b/onnxruntime/core/framework/bfc_arena.cc
@@ -11,7 +11,8 @@ BFCArena::BFCArena(std::unique_ptr<IAllocator> resource_allocator,
                    ArenaExtendStrategy arena_extend_strategy,
                    int initial_chunk_size_bytes,
                    int max_dead_bytes_per_chunk,
-                   int initial_growth_chunk_size_bytes)
+                   int initial_growth_chunk_size_bytes,
+                   int64_t max_power_of_two_extend_bytes)
     : IAllocator(OrtMemoryInfo(resource_allocator->Info().name,
                                OrtAllocatorType::OrtArenaAllocator,
                                resource_allocator->Info().device,
@@ -23,11 +24,13 @@ BFCArena::BFCArena(std::unique_ptr<IAllocator> resource_allocator,
       next_allocation_id_(1),
       initial_chunk_size_bytes_(initial_chunk_size_bytes),
       max_dead_bytes_per_chunk_(max_dead_bytes_per_chunk),
-      initial_growth_chunk_size_bytes_(initial_growth_chunk_size_bytes) {
+      initial_growth_chunk_size_bytes_(initial_growth_chunk_size_bytes),
+      max_power_of_two_extend_bytes_(max_power_of_two_extend_bytes) {
   LOGS_DEFAULT(INFO) << "Creating BFCArena for " << device_allocator_->Info().name
                      << " with following configs: initial_chunk_size_bytes: " << initial_chunk_size_bytes_
                      << " max_dead_bytes_per_chunk: " << max_dead_bytes_per_chunk_
                      << " initial_growth_chunk_size_bytes: " << initial_growth_chunk_size_bytes_
+                     << " max_power_of_two_extend_bytes: " << max_power_of_two_extend_bytes_
                      << " memory limit: " << total_memory
                      << " arena_extend_strategy: " << static_cast<int32_t>(arena_extend_strategy);
 
@@ -144,7 +147,12 @@ Status BFCArena::Extend(size_t rounded_bytes) {
       // we allocated the same number of bytes as the current region
       // the 2x is to double the minimum size of the next amount we'll allocate
       if (!increased_allocation) {
-        curr_region_allocation_bytes_ *= 2;
+        if (arena_extend_strategy_ == ArenaExtendStrategy::kNextPowerOfTwo &&  // keep doubling while below the limit
+            static_cast<int64_t>(curr_region_allocation_bytes_) * 2 < max_power_of_two_extend_bytes_) {  // compare as signed
+          curr_region_allocation_bytes_ *= 2;
+        } else {  // doubling would reach or pass the limit: pin the next extension size to the limit
+          curr_region_allocation_bytes_ = static_cast<size_t>(max_power_of_two_extend_bytes_);
+        }
       }
     } else if (arena_extend_strategy_ == ArenaExtendStrategy::kSameAsRequested) {
       // BFC Arena could cause internal and external fragmentation. But, running training with
@@ -847,13 +855,15 @@ StreamAwareArena::StreamAwareArena(std::unique_ptr<IAllocator> resource_allocato
                                    ArenaExtendStrategy arena_extend_strategy,
                                    int initial_chunk_size_bytes,
                                    int max_dead_bytes_per_chunk,
-                                   int initial_growth_chunk_size_bytes) : BFCArena(std::move(resource_allocator),
-                                                                                   total_memory,
-                                                                                   arena_extend_strategy,
-                                                                                   initial_chunk_size_bytes,
-                                                                                   max_dead_bytes_per_chunk,
-                                                                                   initial_growth_chunk_size_bytes),
-                                                                          enable_cross_stream_reusing_(enable_cross_stream_sharing) {
+                                   int initial_growth_chunk_size_bytes,
+                                   int64_t max_power_of_two_extend_bytes) : BFCArena(std::move(resource_allocator),
+                                                                                     total_memory,
+                                                                                     arena_extend_strategy,
+                                                                                     initial_chunk_size_bytes,
+                                                                                     max_dead_bytes_per_chunk,
+                                                                                     initial_growth_chunk_size_bytes,
+                                                                                     max_power_of_two_extend_bytes),
+                                                                            enable_cross_stream_reusing_(enable_cross_stream_sharing) {
   arena_type_ = ArenaType::StreamAwareArena;
 }
 
diff --git a/onnxruntime/core/framework/bfc_arena.h b/onnxruntime/core/framework/bfc_arena.h
index 311a9ae7869f0..e16b90ded3381 100644
--- a/onnxruntime/core/framework/bfc_arena.h
+++ b/onnxruntime/core/framework/bfc_arena.h
@@ -59,6 +59,7 @@ class BFCArena : public IAllocator {
   static const int DEFAULT_INITIAL_CHUNK_SIZE_BYTES = 1 * 1024 * 1024;
   static const int DEFAULT_MAX_DEAD_BYTES_PER_CHUNK = 128 * 1024 * 1024;
   static const int DEFAULT_INITIAL_GROWTH_CHUNK_SIZE_BYTES = 2 * 1024 * 1024;
+  static const int64_t DEFAULT_MAX_POWER_OF_TWO_EXTEND_BYTES = 1024 * 1024 * 1024;  // 1GB
   static const size_t DEFAULT_MAX_MEM = std::numeric_limits<size_t>::max();
 
   enum ArenaType {
@@ -71,7 +72,8 @@ class BFCArena : public IAllocator {
            ArenaExtendStrategy arena_extend_strategy = DEFAULT_ARENA_EXTEND_STRATEGY,
            int initial_chunk_size_bytes = DEFAULT_INITIAL_CHUNK_SIZE_BYTES,
            int max_dead_bytes_per_chunk = DEFAULT_MAX_DEAD_BYTES_PER_CHUNK,
-           int initial_growth_chunk_size_bytes = DEFAULT_INITIAL_GROWTH_CHUNK_SIZE_BYTES);
+           int initial_growth_chunk_size_bytes = DEFAULT_INITIAL_GROWTH_CHUNK_SIZE_BYTES,
+           int64_t max_power_of_two_extend_bytes = DEFAULT_MAX_POWER_OF_TWO_EXTEND_BYTES);
 
   ~BFCArena() override;
 
@@ -505,6 +507,7 @@ class BFCArena : public IAllocator {
   const int initial_chunk_size_bytes_;
   const int max_dead_bytes_per_chunk_;
   const int initial_growth_chunk_size_bytes_;
+  const int64_t max_power_of_two_extend_bytes_;
 
   // This flag is only relevant if Shrink() is invoked.
   // This is a boolean flag that controls whether the first allocation region
@@ -522,7 +525,8 @@ class StreamAwareArena : public BFCArena {
                    ArenaExtendStrategy arena_extend_strategy = DEFAULT_ARENA_EXTEND_STRATEGY,
                    int initial_chunk_size_bytes = DEFAULT_INITIAL_CHUNK_SIZE_BYTES,
                    int max_dead_bytes_per_chunk = DEFAULT_MAX_DEAD_BYTES_PER_CHUNK,
-                   int initial_growth_chunk_size_bytes = DEFAULT_INITIAL_GROWTH_CHUNK_SIZE_BYTES);
+                   int initial_growth_chunk_size_bytes = DEFAULT_INITIAL_GROWTH_CHUNK_SIZE_BYTES,
+                   int64_t max_power_of_two_extend_bytes = DEFAULT_MAX_POWER_OF_TWO_EXTEND_BYTES);
 
   // If size is 0, then this function returns either NULL,
   // or a unique pointer value that can later be successfully
diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
index c0acd365c1374..74424371e6abc 100644
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -149,7 +149,7 @@ AllocatorPtr CUDAExecutionProvider::CreateCudaAllocator(OrtDevice::DeviceId devi
         device_id,
         true,
         {default_memory_arena_cfg ? *default_memory_arena_cfg
-                                  : OrtArenaCfg(gpu_mem_limit, static_cast<int>(arena_extend_strategy), -1, -1, -1)},
+                                  : OrtArenaCfg(gpu_mem_limit, static_cast<int>(arena_extend_strategy), -1, -1, -1, -1L)},
         // make it stream aware
         true,
         // enable cross stream sharing?
diff --git a/onnxruntime/core/session/environment.cc b/onnxruntime/core/session/environment.cc
index 3a20dc23116a6..dc225b131db7e 100644
--- a/onnxruntime/core/session/environment.cc
+++ b/onnxruntime/core/session/environment.cc
@@ -135,6 +135,7 @@ Status Environment::CreateAndRegisterAllocator(const OrtMemoryInfo& mem_info, co
     int initial_chunk_size_bytes = -1;
     int max_dead_bytes_per_chunk = -1;
     int initial_growth_chunk_size_bytes = -1;
+    int64_t max_power_of_two_extend_bytes = -1L;
 
     // override with values from the user supplied arena_cfg object
     if (arena_cfg) {
@@ -151,10 +152,11 @@ Status Environment::CreateAndRegisterAllocator(const OrtMemoryInfo& mem_info, co
       initial_chunk_size_bytes = arena_cfg->initial_chunk_size_bytes;
       max_dead_bytes_per_chunk = arena_cfg->max_dead_bytes_per_chunk;
       initial_growth_chunk_size_bytes = arena_cfg->initial_growth_chunk_size_bytes;
+      max_power_of_two_extend_bytes = arena_cfg->max_power_of_two_extend_bytes;
     }
 
     OrtArenaCfg l_arena_cfg{max_mem, arena_extend_strategy, initial_chunk_size_bytes, max_dead_bytes_per_chunk,
-                            initial_growth_chunk_size_bytes};
+                            initial_growth_chunk_size_bytes, max_power_of_two_extend_bytes};
     AllocatorCreationInfo alloc_creation_info{
         [mem_info](int) { return std::make_unique<CPUAllocator>(mem_info); },
         0,
diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc
index 708cb825a661e..9c909068056a5 100644
--- a/onnxruntime/core/session/onnxruntime_c_api.cc
+++ b/onnxruntime/core/session/onnxruntime_c_api.cc
@@ -2233,6 +2233,7 @@ ORT_API_STATUS_IMPL(OrtApis::CreateArenaCfg, _In_ size_t max_mem, int arena_exte
   cfg->arena_extend_strategy = arena_extend_strategy;
   cfg->initial_chunk_size_bytes = initial_chunk_size_bytes;
   cfg->max_dead_bytes_per_chunk = max_dead_bytes_per_chunk;
+  cfg->max_power_of_two_extend_bytes = -1L;  // not settable via this API; -1 lets ORT choose the default
   *out = cfg.release();
   return nullptr;
   API_IMPL_END
@@ -2254,6 +2255,8 @@ ORT_API_STATUS_IMPL(OrtApis::CreateArenaCfgV2, _In_reads_(num_keys) const char*
       cfg->max_dead_bytes_per_chunk = static_cast<int>(arena_config_values[i]);
     } else if (strcmp(arena_config_keys[i], "initial_growth_chunk_size_bytes") == 0) {
       cfg->initial_growth_chunk_size_bytes = static_cast<int>(arena_config_values[i]);
+    } else if (strcmp(arena_config_keys[i], "max_power_of_two_extend_bytes") == 0) {
+      cfg->max_power_of_two_extend_bytes = static_cast<int64_t>(arena_config_values[i]);
     } else {
       std::ostringstream oss;
       oss << "Invalid key found: " << arena_config_keys[i];
diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc
index cf709e508dbf3..fe3bb46c92cf6 100644
--- a/onnxruntime/python/onnxruntime_pybind_state.cc
+++ b/onnxruntime/python/onnxruntime_pybind_state.cc
@@ -1148,8 +1148,8 @@ void addObjectMethods(py::module& m, ExecutionProviderRegistrationFn ep_registra
       .def_static("default_memory", []() { return OrtDevice::MemType::DEFAULT; });
 
   py::class_<OrtArenaCfg> ort_arena_cfg_binding(m, "OrtArenaCfg");
-  // Note: Doesn't expose initial_growth_chunk_sizes_bytes option. This constructor kept for
-  // backwards compatibility, key-value pair constructor overload exposes all options
+  // Note: Doesn't expose initial_growth_chunk_sizes_bytes/max_power_of_two_extend_bytes option.
+  // This constructor kept for backwards compatibility, key-value pair constructor overload exposes all options
   // There is a global var: arena_extend_strategy, which means we can't use that var name here
   // See docs/C_API.md for details on what the following parameters mean and how to choose these values
   ort_arena_cfg_binding.def(py::init([](size_t max_mem, int arena_extend_strategy_local,
@@ -1175,6 +1175,8 @@ void addObjectMethods(py::module& m, ExecutionProviderRegistrationFn ep_registra
             ort_arena_cfg->max_dead_bytes_per_chunk = kvp.second.cast<int>();
           } else if (key == "initial_growth_chunk_size_bytes") {
             ort_arena_cfg->initial_growth_chunk_size_bytes = kvp.second.cast<int>();
+          } else if (key == "max_power_of_two_extend_bytes") {
+            ort_arena_cfg->max_power_of_two_extend_bytes = kvp.second.cast<int64_t>();  // field is int64_t; int would reject values above INT_MAX
           } else {
             ORT_THROW("Invalid OrtArenaCfg option: ", key);
           }
@@ -1185,7 +1187,8 @@ void addObjectMethods(py::module& m, ExecutionProviderRegistrationFn ep_registra
       .def_readwrite("arena_extend_strategy", &OrtArenaCfg::arena_extend_strategy)
       .def_readwrite("initial_chunk_size_bytes", &OrtArenaCfg::initial_chunk_size_bytes)
       .def_readwrite("max_dead_bytes_per_chunk", &OrtArenaCfg::max_dead_bytes_per_chunk)
-      .def_readwrite("initial_growth_chunk_size_bytes", &OrtArenaCfg::initial_growth_chunk_size_bytes);
+      .def_readwrite("initial_growth_chunk_size_bytes", &OrtArenaCfg::initial_growth_chunk_size_bytes)
+      .def_readwrite("max_power_of_two_extend_bytes", &OrtArenaCfg::max_power_of_two_extend_bytes);
 
   py::class_<OrtMemoryInfo> ort_memory_info_binding(m, "OrtMemoryInfo");
   ort_memory_info_binding.def(py::init([](const char* name, OrtAllocatorType type, int id, OrtMemType mem_type) {
diff --git a/onnxruntime/test/framework/bfc_arena_test.cc b/onnxruntime/test/framework/bfc_arena_test.cc
index 584a0a32945b3..a6034f2ea0b79 100644
--- a/onnxruntime/test/framework/bfc_arena_test.cc
+++ b/onnxruntime/test/framework/bfc_arena_test.cc
@@ -2,6 +2,7 @@
 // Licensed under the MIT License.
 
 #include "core/framework/bfc_arena.h"
+#include "core/framework/allocatormgr.h"
 #include "gtest/gtest.h"
 #include "gmock/gmock.h"
 #include <cstdlib>
@@ -409,5 +410,70 @@ TEST(StreamAwareArenaTest, TestSecureTheChunk) {
   a.Free(p2);
 }
 
+TEST(BFCArenaTest, TestExtendStrategy) {  // Checks the growth steps of total_allocated_bytes with the default and a 32MB extend limit.
+  int64_t extend_delta_bytes = 0;  // most recent growth step observed in total_allocated_bytes
+  {
+    // Use kNextPowerOfTwo strategy with default extension limit: 1GB.
+    BFCArena a(std::unique_ptr<IAllocator>(new CPUAllocator()), 1UL << 30, ArenaExtendStrategy::kNextPowerOfTwo);
+    size_t block_size = 1 << 20;  // 1MB
+    a.Alloc(block_size);
+    AllocatorStats stats;
+    a.GetStats(&stats);
+    int64_t prev_allocated_bytes = stats.total_allocated_bytes;
+    extend_delta_bytes = stats.total_allocated_bytes;
+    ASSERT_EQ(extend_delta_bytes, static_cast<int64_t>(block_size));  // the first step equals the 1MB request
+    for (int i = 1; i < 256; ++i) {  // 255 further 1MB allocations
+      a.Alloc(block_size);
+      a.GetStats(&stats);
+      if (stats.total_allocated_bytes != prev_allocated_bytes) {  // the reported total changed on this Alloc
+        int64_t new_delta_bytes = stats.total_allocated_bytes - prev_allocated_bytes;
+        ASSERT_EQ(new_delta_bytes, 2 * extend_delta_bytes);  // each growth step must be double the previous one
+        extend_delta_bytes = new_delta_bytes;
+        prev_allocated_bytes = stats.total_allocated_bytes;
+      }
+    }
+  }
+  int64_t extend_limit = 1 << 25;  // 32MB
+  ASSERT_GT(extend_delta_bytes, extend_limit);  // the run above (default limit) ended with a step larger than 32MB
+  extend_delta_bytes = 0;
+  {
+    // Use kNextPowerOfTwo strategy with much smaller extension limit: 32MB.
+    OrtArenaCfg config(0, 0, -1, -1, -1, extend_limit);  // max_mem=0, strategy 0 (kNextPowerOfTwo), three defaults, then the limit
+    AllocatorCreationInfo device_info{
+        [](OrtDevice::DeviceId) { return std::make_unique<CPUAllocator>(); },
+        0, true, config};
+    auto allocator = CreateAllocator(device_info);
+    size_t block_size = 1 << 20;  // 1MB
+    BFCArena& a = *static_cast<BFCArena*>(allocator.get());  // assumes CreateAllocator returned a BFCArena here - TODO confirm
+    a.Alloc(block_size);
+    AllocatorStats stats;
+    a.GetStats(&stats);
+    int64_t prev_allocated_bytes = stats.total_allocated_bytes;
+    extend_delta_bytes = stats.total_allocated_bytes;
+    ASSERT_EQ(extend_delta_bytes, static_cast<int64_t>(block_size));  // the first step equals the 1MB request
+    int reach_limit_count = 0;  // number of growth steps that were not below extend_limit
+    for (int i = 1; i < 256; ++i) {  // 255 further 1MB allocations
+      a.Alloc(block_size);
+      a.GetStats(&stats);
+      if (stats.total_allocated_bytes != prev_allocated_bytes) {  // the reported total changed on this Alloc
+        int64_t new_delta_bytes = stats.total_allocated_bytes - prev_allocated_bytes;
+        if (new_delta_bytes < extend_limit) {  // still below the limit: doubling is expected
+          ASSERT_EQ(new_delta_bytes, 2 * extend_delta_bytes) << "index:" << i;
+        } else {
+          // The increasing of new chunk reaches the limit.
+          ++reach_limit_count;
+          ASSERT_EQ(new_delta_bytes, extend_limit);  // a step must never exceed the configured limit
+        }
+        extend_delta_bytes = new_delta_bytes;
+        prev_allocated_bytes = stats.total_allocated_bytes;
+      }
+    }
+    ASSERT_GT(reach_limit_count, 2);  // the limit-sized step must have been seen more than twice
+    // It is OK to allocate more than extend_limit.
+    ASSERT_NE(a.Alloc(block_size * 64), nullptr);  // a single 64MB request must still succeed
+  }
+  ASSERT_EQ(extend_delta_bytes, extend_limit);  // the last step observed in the second run equals the limit
+}
+
 }  // namespace test
 }  // namespace onnxruntime
diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py
index 416cdd5ddaff3..b49547a040907 100644
--- a/onnxruntime/test/python/onnxruntime_test_python.py
+++ b/onnxruntime/test/python/onnxruntime_test_python.py
@@ -1314,7 +1314,7 @@ def testSharedAllocatorUsingCreateAndRegisterAllocator(self):  # noqa: N802
         # Create and register an arena based allocator
 
         # To create an OrtArenaCfg using non-default parameters, use one of below templates:
-        # ort_arena_cfg = onnxrt.OrtArenaCfg(0, -1, -1, -1) - Note: doesn't expose initial_growth_chunk_size_bytes option
+        # ort_arena_cfg = onnxrt.OrtArenaCfg(0, -1, -1, -1) - Note: doesn't expose initial_growth_chunk_size_bytes/max_power_of_two_extend_bytes option
         # ort_arena_cfg = onnxrt.OrtArenaCfg({"max_mem": -1, ""arena_extend_strategy": 1, etc..})
         ort_memory_info = onnxrt.OrtMemoryInfo(
             "Cpu",
@@ -1482,6 +1482,8 @@ def verify_allocator(allocator, expected_config):
                     self.assertEqual(allocator.max_dead_bytes_per_chunk, val)
                 elif key == "initial_growth_chunk_size_bytes":
                     self.assertEqual(allocator.initial_growth_chunk_size_bytes, val)
+                elif key == "max_power_of_two_extend_bytes":
+                    self.assertEqual(allocator.max_power_of_two_extend_bytes, val)
                 else:
                     raise ValueError("Invalid OrtArenaCfg option: " + key)
 
@@ -1506,6 +1508,18 @@ def verify_allocator(allocator, expected_config):
         ort_arena_cfg_kvp = onnxrt.OrtArenaCfg(expected_kvp_allocator)
         verify_allocator(ort_arena_cfg_kvp, expected_kvp_allocator)
 
+        # Verify key-value pair initialization
+        expected_kvp_allocator = {
+            "max_mem": 32,
+            "arena_extend_strategy": 11,
+            "initial_chunk_size_bytes": 18,
+            "max_dead_bytes_per_chunk": 14,
+            "initial_growth_chunk_size_bytes": 12,
+            "max_power_of_two_extend_bytes": 17,
+        }
+        ort_arena_cfg_kvp = onnxrt.OrtArenaCfg(expected_kvp_allocator)
+        verify_allocator(ort_arena_cfg_kvp, expected_kvp_allocator)
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=1)
diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc
index f53a93225b524..8049d7baddb6c 100644
--- a/onnxruntime/test/shared_lib/test_inference.cc
+++ b/onnxruntime/test/shared_lib/test_inference.cc
@@ -2499,8 +2499,8 @@ TEST(CApiTest, ConfigureCudaArenaAndDemonstrateMemoryArenaShrinkage) {
 
   Ort::SessionOptions session_options;
 
-  const char* keys[] = {"max_mem", "arena_extend_strategy", "initial_chunk_size_bytes", "max_dead_bytes_per_chunk", "initial_growth_chunk_size_bytes"};
-  const size_t values[] = {0 /*let ort pick default max memory*/, 0, 1024, 0, 256};
+  const char* keys[] = {"max_mem", "arena_extend_strategy", "initial_chunk_size_bytes", "max_dead_bytes_per_chunk", "initial_growth_chunk_size_bytes", "max_power_of_two_extend_bytes"};
+  const size_t values[] = {0 /*let ort pick default max memory*/, 0, 1024, 0, 256, 1L << 24};
 
   OrtArenaCfg* arena_cfg = nullptr;
   ASSERT_TRUE(api.CreateArenaCfgV2(keys, values, 5, &arena_cfg) == nullptr);
diff --git a/onnxruntime/test/testdata/custom_execution_provider_library/my_execution_provider.cc b/onnxruntime/test/testdata/custom_execution_provider_library/my_execution_provider.cc
index cf3c2ae39fa46..b0fb0347c7977 100644
--- a/onnxruntime/test/testdata/custom_execution_provider_library/my_execution_provider.cc
+++ b/onnxruntime/test/testdata/custom_execution_provider_library/my_execution_provider.cc
@@ -19,7 +19,7 @@ MyExecutionProvider::MyExecutionProvider(const MyProviderInfo& info)
       [](OrtDevice::DeviceId device_id) { return std::make_unique<MyEPAllocator>(device_id); },
       device_id_,
       true,
-      {0, 1, -1, -1, -1}};
+      {0, 1, -1, -1, -1, -1L}};
   InsertAllocator(CreateAllocator(device_info));
 }