From e7cd6827bb75b150e61c7438ad171ccaab86fd55 Mon Sep 17 00:00:00 2001 From: robot-piglet Date: Fri, 29 Nov 2024 17:33:55 +0300 Subject: [PATCH 01/16] Intermediate changes commit_hash:b11b6c72612e89eb992dacf3b068f9cc57067720 --- .../benchmark/.yandex_meta/override.nix | 4 +- .../benchmark/include/benchmark/benchmark.h | 52 ++++++++++++++++++- .../google/benchmark/src/benchmark.cc | 14 +++++ .../google/benchmark/src/benchmark_runner.cc | 43 ++++++++++----- .../google/benchmark/src/benchmark_runner.h | 7 --- .../google/benchmark/src/colorprint.cc | 16 ++++-- .../google/benchmark/src/perf_counters.cc | 2 - .../google/benchmark/src/sysinfo.cc | 6 +++ .../restricted/google/benchmark/test/ya.make | 2 +- .../google/benchmark/tools/compare/ya.make | 4 +- .../restricted/google/benchmark/tools/ya.make | 2 +- contrib/restricted/google/benchmark/ya.make | 6 +-- 12 files changed, 120 insertions(+), 38 deletions(-) diff --git a/contrib/restricted/google/benchmark/.yandex_meta/override.nix b/contrib/restricted/google/benchmark/.yandex_meta/override.nix index 1eaa98bdf512..13fbc4444632 100644 --- a/contrib/restricted/google/benchmark/.yandex_meta/override.nix +++ b/contrib/restricted/google/benchmark/.yandex_meta/override.nix @@ -1,11 +1,11 @@ pkgs: attrs: with pkgs; with attrs; rec { - version = "1.9.0"; + version = "1.9.1"; src = fetchFromGitHub { owner = "google"; repo = "benchmark"; rev = "v${version}"; - hash = "sha256-5cl1PIjhXaL58kSyWZXRWLq6BITS2BwEovPhwvk2e18="; + hash = "sha256-5xDg1duixLoWIuy59WT0r5ZBAvTR6RPP7YrhBYkMxc8="; }; buildInputs = [ gtest ]; diff --git a/contrib/restricted/google/benchmark/include/benchmark/benchmark.h b/contrib/restricted/google/benchmark/include/benchmark/benchmark.h index 4cdb4515cb94..86f9dbbabb52 100644 --- a/contrib/restricted/google/benchmark/include/benchmark/benchmark.h +++ b/contrib/restricted/google/benchmark/include/benchmark/benchmark.h @@ -290,11 +290,50 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond); #define BENCHMARK_OVERRIDE #endif +#if defined(__GNUC__) +// Determine the cacheline size based on architecture +#if defined(__i386__) || defined(__x86_64__) +#define BENCHMARK_INTERNAL_CACHELINE_SIZE 64 +#elif defined(__powerpc64__) +#define BENCHMARK_INTERNAL_CACHELINE_SIZE 128 +#elif defined(__aarch64__) +#define BENCHMARK_INTERNAL_CACHELINE_SIZE 64 +#elif defined(__arm__) +// Cache line sizes for ARM: These values are not strictly correct since +// cache line sizes depend on implementations, not architectures. There +// are even implementations with cache line sizes configurable at boot +// time. +#if defined(__ARM_ARCH_5T__) +#define BENCHMARK_INTERNAL_CACHELINE_SIZE 32 +#elif defined(__ARM_ARCH_7A__) +#define BENCHMARK_INTERNAL_CACHELINE_SIZE 64 +#endif // ARM_ARCH +#endif // arches +#endif // __GNUC__ + +#ifndef BENCHMARK_INTERNAL_CACHELINE_SIZE +// A reasonable default guess. Note that overestimates tend to waste more +// space, while underestimates tend to waste more time. +#define BENCHMARK_INTERNAL_CACHELINE_SIZE 64 +#endif + +#if defined(__GNUC__) +// Indicates that the declared object be cache aligned using +// `BENCHMARK_INTERNAL_CACHELINE_SIZE` (see above). 
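// [Editor's illustration, not part of the upstream header] A hedged usage
// sketch of the macro defined below: on compilers where it expands to an
// alignment attribute (GCC/Clang/MSVC), a hypothetical per-thread counter can
// be padded out to the cache line size resolved above (64 bytes on x86-64 and
// aarch64), so that adjacent counters never share a cache line:
//
//   struct BENCHMARK_INTERNAL_CACHELINE_ALIGNED PerThreadCounter {
//     int64_t value;
//   };
//   static_assert(alignof(PerThreadCounter) >= BENCHMARK_INTERNAL_CACHELINE_SIZE,
//                 "expected cacheline alignment");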
+#define BENCHMARK_INTERNAL_CACHELINE_ALIGNED \ + __attribute__((aligned(BENCHMARK_INTERNAL_CACHELINE_SIZE))) +#elif defined(_MSC_VER) +#define BENCHMARK_INTERNAL_CACHELINE_ALIGNED \ + __declspec(align(BENCHMARK_INTERNAL_CACHELINE_SIZE)) +#else +#define BENCHMARK_INTERNAL_CACHELINE_ALIGNED +#endif + #if defined(_MSC_VER) #pragma warning(push) // C4251: needs to have dll-interface to be used by clients of class #pragma warning(disable : 4251) -#endif +#endif // _MSC_VER_ namespace benchmark { class BenchmarkReporter; @@ -757,9 +796,14 @@ enum Skipped } // namespace internal +#if defined(_MSC_VER) +#pragma warning(push) +// C4324: 'benchmark::State': structure was padded due to alignment specifier +#pragma warning(disable : 4324) +#endif // _MSC_VER_ // State is passed to a running Benchmark and contains state for the // benchmark to use. -class BENCHMARK_EXPORT State { +class BENCHMARK_EXPORT BENCHMARK_INTERNAL_CACHELINE_ALIGNED State { public: struct StateIterator; friend struct StateIterator; @@ -1024,6 +1068,9 @@ class BENCHMARK_EXPORT State { friend class internal::BenchmarkInstance; }; +#if defined(_MSC_VER) +#pragma warning(pop) +#endif // _MSC_VER_ inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunning() { return KeepRunningInternal(1, /*is_batch=*/false); @@ -1507,6 +1554,7 @@ class Fixture : public internal::Benchmark { BaseClass##_##Method##_Benchmark #define BENCHMARK_PRIVATE_DECLARE(n) \ + /* NOLINTNEXTLINE(misc-use-anonymous-namespace) */ \ static ::benchmark::internal::Benchmark* BENCHMARK_PRIVATE_NAME(n) \ BENCHMARK_UNUSED diff --git a/contrib/restricted/google/benchmark/src/benchmark.cc b/contrib/restricted/google/benchmark/src/benchmark.cc index b7767bd00a26..0ea90aeb6ade 100644 --- a/contrib/restricted/google/benchmark/src/benchmark.cc +++ b/contrib/restricted/google/benchmark/src/benchmark.cc @@ -92,6 +92,11 @@ BM_DEFINE_double(benchmark_min_warmup_time, 0.0); // standard deviation of the runs will be reported. BM_DEFINE_int32(benchmark_repetitions, 1); +// If enabled, forces each benchmark to execute exactly one iteration and one +// repetition, bypassing any configured +// MinTime()/MinWarmUpTime()/Iterations()/Repetitions() +BM_DEFINE_bool(benchmark_dry_run, false); + // If set, enable random interleaving of repetitions of all benchmarks. // See http://github.com/google/benchmark/issues/1051 for details. BM_DEFINE_bool(benchmark_enable_random_interleaving, false); @@ -663,6 +668,10 @@ void RegisterMemoryManager(MemoryManager* manager) { } void RegisterProfilerManager(ProfilerManager* manager) { + // Don't allow overwriting an existing manager. 
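// [Editor's illustration, not part of the source] Hedged usage sketch,
// assuming a hypothetical MyProfiler derived from benchmark::ProfilerManager:
// register a single instance for the process lifetime, and pass nullptr to
// deregister it before installing a different manager, which is what the
// check below enforces.
//
//   static MyProfiler profiler;
//   benchmark::RegisterProfilerManager(&profiler);  // ok: none registered yet
//   benchmark::RegisterProfilerManager(nullptr);    // deregister again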
+ if (manager != nullptr) { + BM_CHECK_EQ(internal::profiler_manager, nullptr); + } internal::profiler_manager = manager; } @@ -717,6 +726,7 @@ void ParseCommandLineFlags(int* argc, char** argv) { &FLAGS_benchmark_min_warmup_time) || ParseInt32Flag(argv[i], "benchmark_repetitions", &FLAGS_benchmark_repetitions) || + ParseBoolFlag(argv[i], "benchmark_dry_run", &FLAGS_benchmark_dry_run) || ParseBoolFlag(argv[i], "benchmark_enable_random_interleaving", &FLAGS_benchmark_enable_random_interleaving) || ParseBoolFlag(argv[i], "benchmark_report_aggregates_only", @@ -755,6 +765,9 @@ void ParseCommandLineFlags(int* argc, char** argv) { if (FLAGS_benchmark_color.empty()) { PrintUsageAndExit(); } + if (FLAGS_benchmark_dry_run) { + AddCustomContext("dry_run", "true"); + } for (const auto& kv : FLAGS_benchmark_context) { AddCustomContext(kv.first, kv.second); } @@ -783,6 +796,7 @@ void PrintDefaultHelp() { " [--benchmark_min_time=`x` OR `s` ]\n" " [--benchmark_min_warmup_time=]\n" " [--benchmark_repetitions=]\n" + " [--benchmark_dry_run={true|false}]\n" " [--benchmark_enable_random_interleaving={true|false}]\n" " [--benchmark_report_aggregates_only={true|false}]\n" " [--benchmark_display_aggregates_only={true|false}]\n" diff --git a/contrib/restricted/google/benchmark/src/benchmark_runner.cc b/contrib/restricted/google/benchmark/src/benchmark_runner.cc index a38093937a07..463f69fc522d 100644 --- a/contrib/restricted/google/benchmark/src/benchmark_runner.cc +++ b/contrib/restricted/google/benchmark/src/benchmark_runner.cc @@ -58,6 +58,14 @@ namespace benchmark { +BM_DECLARE_bool(benchmark_dry_run); +BM_DECLARE_string(benchmark_min_time); +BM_DECLARE_double(benchmark_min_warmup_time); +BM_DECLARE_int32(benchmark_repetitions); +BM_DECLARE_bool(benchmark_report_aggregates_only); +BM_DECLARE_bool(benchmark_display_aggregates_only); +BM_DECLARE_string(benchmark_perf_counters); + namespace internal { MemoryManager* memory_manager = nullptr; @@ -126,14 +134,14 @@ BenchmarkReporter::Run CreateRunReport( void RunInThread(const BenchmarkInstance* b, IterationCount iters, int thread_id, ThreadManager* manager, PerfCountersMeasurement* perf_counters_measurement, - ProfilerManager* profiler_manager) { + ProfilerManager* profiler_manager_) { internal::ThreadTimer timer( b->measure_process_cpu_time() ? internal::ThreadTimer::CreateProcessCpuTime() : internal::ThreadTimer::Create()); State st = b->Run(iters, thread_id, &timer, manager, - perf_counters_measurement, profiler_manager); + perf_counters_measurement, profiler_manager_); BM_CHECK(st.skipped() || st.iterations() >= st.max_iterations) << "Benchmark returned before State::KeepRunning() returned false!"; { @@ -228,20 +236,29 @@ BenchmarkRunner::BenchmarkRunner( : b(b_), reports_for_family(reports_for_family_), parsed_benchtime_flag(ParseBenchMinTime(FLAGS_benchmark_min_time)), - min_time(ComputeMinTime(b_, parsed_benchtime_flag)), - min_warmup_time((!IsZero(b.min_time()) && b.min_warmup_time() > 0.0) - ? b.min_warmup_time() - : FLAGS_benchmark_min_warmup_time), - warmup_done(!(min_warmup_time > 0.0)), - repeats(b.repetitions() != 0 ? b.repetitions() - : FLAGS_benchmark_repetitions), + min_time(FLAGS_benchmark_dry_run + ? 0 + : ComputeMinTime(b_, parsed_benchtime_flag)), + min_warmup_time( + FLAGS_benchmark_dry_run + ? 0 + : ((!IsZero(b.min_time()) && b.min_warmup_time() > 0.0) + ? b.min_warmup_time() + : FLAGS_benchmark_min_warmup_time)), + warmup_done(FLAGS_benchmark_dry_run ? true : !(min_warmup_time > 0.0)), + repeats(FLAGS_benchmark_dry_run + ? 
1 + : (b.repetitions() != 0 ? b.repetitions() + : FLAGS_benchmark_repetitions)), has_explicit_iteration_count(b.iterations() != 0 || parsed_benchtime_flag.tag == BenchTimeType::ITERS), pool(static_cast(b.threads() - 1)), - iters(has_explicit_iteration_count - ? ComputeIters(b_, parsed_benchtime_flag) - : 1), + iters(FLAGS_benchmark_dry_run + ? 1 + : (has_explicit_iteration_count + ? ComputeIters(b_, parsed_benchtime_flag) + : 1)), perf_counters_measurement_ptr(pcm_) { run_results.display_report_aggregates_only = (FLAGS_benchmark_report_aggregates_only || @@ -339,7 +356,7 @@ bool BenchmarkRunner::ShouldReportIterationResults( // Determine if this run should be reported; // Either it has run for a sufficient amount of time // or because an error was reported. - return i.results.skipped_ || + return i.results.skipped_ || FLAGS_benchmark_dry_run || i.iters >= kMaxIterations || // Too many iterations already. i.seconds >= GetMinTimeToApply() || // The elapsed time is large enough. diff --git a/contrib/restricted/google/benchmark/src/benchmark_runner.h b/contrib/restricted/google/benchmark/src/benchmark_runner.h index cd34d2d5bb0d..6e5ceb31e003 100644 --- a/contrib/restricted/google/benchmark/src/benchmark_runner.h +++ b/contrib/restricted/google/benchmark/src/benchmark_runner.h @@ -25,13 +25,6 @@ namespace benchmark { -BM_DECLARE_string(benchmark_min_time); -BM_DECLARE_double(benchmark_min_warmup_time); -BM_DECLARE_int32(benchmark_repetitions); -BM_DECLARE_bool(benchmark_report_aggregates_only); -BM_DECLARE_bool(benchmark_display_aggregates_only); -BM_DECLARE_string(benchmark_perf_counters); - namespace internal { extern MemoryManager* memory_manager; diff --git a/contrib/restricted/google/benchmark/src/colorprint.cc b/contrib/restricted/google/benchmark/src/colorprint.cc index abc71492f77a..fd1971ad3cd8 100644 --- a/contrib/restricted/google/benchmark/src/colorprint.cc +++ b/contrib/restricted/google/benchmark/src/colorprint.cc @@ -135,19 +135,25 @@ void ColorPrintf(std::ostream& out, LogColor color, const char* fmt, // Gets the current text color. CONSOLE_SCREEN_BUFFER_INFO buffer_info; GetConsoleScreenBufferInfo(stdout_handle, &buffer_info); - const WORD old_color_attrs = buffer_info.wAttributes; + const WORD original_color_attrs = buffer_info.wAttributes; // We need to flush the stream buffers into the console before each // SetConsoleTextAttribute call lest it affect the text that is already // printed but has not yet reached the console. out.flush(); - SetConsoleTextAttribute(stdout_handle, - GetPlatformColorCode(color) | FOREGROUND_INTENSITY); + + const WORD original_background_attrs = + original_color_attrs & (BACKGROUND_RED | BACKGROUND_GREEN | + BACKGROUND_BLUE | BACKGROUND_INTENSITY); + + SetConsoleTextAttribute(stdout_handle, GetPlatformColorCode(color) | + FOREGROUND_INTENSITY | + original_background_attrs); out << FormatString(fmt, args); out.flush(); - // Restores the text color. - SetConsoleTextAttribute(stdout_handle, old_color_attrs); + // Restores the text and background color. 
+ SetConsoleTextAttribute(stdout_handle, original_color_attrs); #else const char* color_code = GetPlatformColorCode(color); if (color_code) out << FormatString("\033[0;3%sm", color_code); diff --git a/contrib/restricted/google/benchmark/src/perf_counters.cc b/contrib/restricted/google/benchmark/src/perf_counters.cc index 66ac6f0afb19..c1a83164481b 100644 --- a/contrib/restricted/google/benchmark/src/perf_counters.cc +++ b/contrib/restricted/google/benchmark/src/perf_counters.cc @@ -26,8 +26,6 @@ namespace benchmark { namespace internal { -constexpr size_t PerfCounterValues::kMaxCounters; - #if defined HAVE_LIBPFM size_t PerfCounterValues::Read(const std::vector& leaders) { diff --git a/contrib/restricted/google/benchmark/src/sysinfo.cc b/contrib/restricted/google/benchmark/src/sysinfo.cc index 617d276e4739..358e4d4230ac 100644 --- a/contrib/restricted/google/benchmark/src/sysinfo.cc +++ b/contrib/restricted/google/benchmark/src/sysinfo.cc @@ -353,6 +353,12 @@ std::vector GetCacheSizesWindows() { C.size = static_cast(cache.Size); C.type = "Unknown"; switch (cache.Type) { +// Windows SDK version >= 10.0.26100.0 +// 0x0A000010 is the value of NTDDI_WIN11_GE +#if NTDDI_VERSION >= 0x0A000010 + case CacheUnknown: + break; +#endif case CacheUnified: C.type = "Unified"; break; diff --git a/contrib/restricted/google/benchmark/test/ya.make b/contrib/restricted/google/benchmark/test/ya.make index 0c2cad107df4..1ef8afafb8f5 100644 --- a/contrib/restricted/google/benchmark/test/ya.make +++ b/contrib/restricted/google/benchmark/test/ya.make @@ -4,7 +4,7 @@ GTEST(benchmark_gtest) WITHOUT_LICENSE_TEXTS() -VERSION(1.9.0) +VERSION(1.9.1) LICENSE(Apache-2.0) diff --git a/contrib/restricted/google/benchmark/tools/compare/ya.make b/contrib/restricted/google/benchmark/tools/compare/ya.make index cf364865b5aa..0d4aa76cfb85 100644 --- a/contrib/restricted/google/benchmark/tools/compare/ya.make +++ b/contrib/restricted/google/benchmark/tools/compare/ya.make @@ -4,9 +4,9 @@ PY3_PROGRAM() WITHOUT_LICENSE_TEXTS() -VERSION(1.9.0) +VERSION(1.9.1) -ORIGINAL_SOURCE(https://github.com/google/benchmark/archive/v1.9.0.tar.gz) +ORIGINAL_SOURCE(https://github.com/google/benchmark/archive/v1.9.1.tar.gz) LICENSE(Apache-2.0) diff --git a/contrib/restricted/google/benchmark/tools/ya.make b/contrib/restricted/google/benchmark/tools/ya.make index c010117d648e..6f36ef1416fb 100644 --- a/contrib/restricted/google/benchmark/tools/ya.make +++ b/contrib/restricted/google/benchmark/tools/ya.make @@ -1,6 +1,6 @@ # Generated by devtools/yamaker. 
-VERSION(1.9.0) +VERSION(1.9.1) IF (NOT USE_STL_SYSTEM) IF (NOT USE_SYSTEM_PYTHON OR NOT _SYSTEM_PYTHON27) diff --git a/contrib/restricted/google/benchmark/ya.make b/contrib/restricted/google/benchmark/ya.make index 73451b6535cf..a38b55df68ab 100644 --- a/contrib/restricted/google/benchmark/ya.make +++ b/contrib/restricted/google/benchmark/ya.make @@ -2,9 +2,9 @@ LIBRARY() -VERSION(1.9.0) +VERSION(1.9.1) -ORIGINAL_SOURCE(https://github.com/google/benchmark/archive/v1.9.0.tar.gz) +ORIGINAL_SOURCE(https://github.com/google/benchmark/archive/v1.9.1.tar.gz) LICENSE(Apache-2.0) @@ -21,7 +21,7 @@ NO_UTIL() CFLAGS( GLOBAL -DBENCHMARK_STATIC_DEFINE - -DBENCHMARK_VERSION=\"v1.9.0\" + -DBENCHMARK_VERSION=\"v1.9.1\" -DHAVE_POSIX_REGEX -DHAVE_PTHREAD_AFFINITY -DHAVE_STD_REGEX From fefd531d45e1854a567b58f4d6a661d3d365ece1 Mon Sep 17 00:00:00 2001 From: dgolear Date: Fri, 29 Nov 2024 17:33:57 +0300 Subject: [PATCH 02/16] YT: Annotate ts out of range errors with correct error code commit_hash:9a961c725ab0976f4a6eb81e301ada80a0da62c3 --- yt/yt/client/table_client/unversioned_row.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/yt/yt/client/table_client/unversioned_row.cpp b/yt/yt/client/table_client/unversioned_row.cpp index 1dce4fb1b44b..2c71c617f272 100644 --- a/yt/yt/client/table_client/unversioned_row.cpp +++ b/yt/yt/client/table_client/unversioned_row.cpp @@ -1274,7 +1274,9 @@ void ValidateReadTimestamp(TTimestamp timestamp) timestamp != AsyncLastCommittedTimestamp && (timestamp < MinTimestamp || timestamp > MaxTimestamp)) { - THROW_ERROR_EXCEPTION("Invalid read timestamp %x", timestamp); + THROW_ERROR_EXCEPTION(NTableClient::EErrorCode::TimestampOutOfRange, + "Invalid read timestamp %x", + timestamp); } } From 2aee7cda6c758679145563340ff94fc652670ea1 Mon Sep 17 00:00:00 2001 From: robot-piglet Date: Fri, 29 Nov 2024 18:15:29 +0300 Subject: [PATCH 03/16] Intermediate changes commit_hash:60e005cdf76d5bff2a370a6b8f35ef4f6792f414 --- library/cpp/yt/logging/unittests/ya.make | 4 - library/cpp/yt/memory/unittests/ya.make | 4 - library/cpp/ytalloc/impl/README.md | 5 - library/cpp/ytalloc/impl/bridge.cpp | 257 - library/cpp/ytalloc/impl/core-inl.h | 4849 ------------------ library/cpp/ytalloc/impl/ya.make | 15 - yt/yt/client/table_client/unittests/ya.make | 2 - yt/yt/client/unittests/ya.make | 2 - yt/yt/core/actions/unittests/ya.make | 4 - yt/yt/core/bus/unittests/ya.make | 4 - yt/yt/core/compression/unittests/ya.make | 4 - yt/yt/core/concurrency/unittests/ya.make | 4 - yt/yt/core/crypto/unittests/ya.make | 4 - yt/yt/core/http/unittests/ya.make | 4 - yt/yt/core/json/unittests/ya.make | 4 - yt/yt/core/logging/unittests/ya.make | 4 - yt/yt/core/misc/unittests/ya.make | 4 - yt/yt/core/net/unittests/ya.make | 4 - yt/yt/core/profiling/unittests/ya.make | 4 - yt/yt/core/rpc/unittests/main/ya.make | 4 - yt/yt/core/rpc/unittests/rpc_ut.cpp | 33 - yt/yt/core/rpc/unittests/shutdown/ya.make | 4 - yt/yt/core/ya.make | 2 +- yt/yt/core/ypath/unittests/ya.make | 4 - yt/yt/core/yson/unittests/ya.make | 4 - yt/yt/core/ytree/unittests/ya.make | 4 - yt/yt/library/auth/unittests/ya.make | 2 - yt/yt/library/decimal/unittests/ya.make | 2 - yt/yt/library/erasure/impl/unittests/ya.make | 2 - yt/yt/library/process/unittests/ya.make | 2 - yt/yt/library/tvm/service/unittests/ya.make | 2 - 31 files changed, 1 insertion(+), 5246 deletions(-) delete mode 100644 library/cpp/ytalloc/impl/README.md delete mode 100644 library/cpp/ytalloc/impl/bridge.cpp delete mode 100644 
library/cpp/ytalloc/impl/core-inl.h delete mode 100644 library/cpp/ytalloc/impl/ya.make diff --git a/library/cpp/yt/logging/unittests/ya.make b/library/cpp/yt/logging/unittests/ya.make index 021b0d09d6af..4baea62140e1 100644 --- a/library/cpp/yt/logging/unittests/ya.make +++ b/library/cpp/yt/logging/unittests/ya.make @@ -2,10 +2,6 @@ GTEST(unittester-library-logging) INCLUDE(${ARCADIA_ROOT}/library/cpp/yt/ya_cpp.make.inc) -IF (NOT OS_WINDOWS) - ALLOCATOR(YT) -ENDIF() - SRCS( logger_ut.cpp static_analysis_ut.cpp diff --git a/library/cpp/yt/memory/unittests/ya.make b/library/cpp/yt/memory/unittests/ya.make index 920873de265a..1a61a130baf7 100644 --- a/library/cpp/yt/memory/unittests/ya.make +++ b/library/cpp/yt/memory/unittests/ya.make @@ -2,10 +2,6 @@ GTEST(unittester-library-memory) INCLUDE(${ARCADIA_ROOT}/library/cpp/yt/ya_cpp.make.inc) -IF (NOT OS_WINDOWS) - ALLOCATOR(YT) -ENDIF() - SRCS( atomic_intrusive_ptr_ut.cpp chunked_memory_pool_ut.cpp diff --git a/library/cpp/ytalloc/impl/README.md b/library/cpp/ytalloc/impl/README.md deleted file mode 100644 index 6d142085a86d..000000000000 --- a/library/cpp/ytalloc/impl/README.md +++ /dev/null @@ -1,5 +0,0 @@ -This module contains the actual implementation of YTAlloc. Use -``` -ALLOCATOR(YT) -``` -to get it linked into your binary. diff --git a/library/cpp/ytalloc/impl/bridge.cpp b/library/cpp/ytalloc/impl/bridge.cpp deleted file mode 100644 index 95a476199827..000000000000 --- a/library/cpp/ytalloc/impl/bridge.cpp +++ /dev/null @@ -1,257 +0,0 @@ -#include "core-inl.h" - -#include - -#include - -#include - -namespace NYT::NYTAlloc { - -//////////////////////////////////////////////////////////////////////////////// -// YTAlloc public API - -#ifdef YT_ALLOC_ENABLED - -void* Allocate(size_t size) -{ - return AllocateInline(size); -} - -void* AllocateSmall(size_t rank) -{ - return AllocateSmallInline(rank); -} - -void* AllocatePageAligned(size_t size) -{ - return AllocatePageAlignedInline(size); -} - -void Free(void* ptr) -{ - FreeInline(ptr); -} - -void FreeNonNull(void* ptr) -{ - FreeNonNullInline(ptr); -} - -size_t GetAllocationSize(const void* ptr) -{ - return GetAllocationSizeInline(ptr); -} - -size_t GetAllocationSize(size_t size) -{ - return GetAllocationSizeInline(size); -} - -#endif - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace NYT::NYTAlloc - -namespace NYT { - -using namespace NYTAlloc; - -//////////////////////////////////////////////////////////////////////////////// -// Memory tags API bridge - -TMemoryTag GetCurrentMemoryTag() -{ - return NYTAlloc::TThreadManager::GetCurrentMemoryTag(); -} - -void SetCurrentMemoryTag(TMemoryTag tag) -{ - TThreadManager::SetCurrentMemoryTag(tag); -} - -void GetMemoryUsageForTags(const TMemoryTag* tags, size_t count, size_t* results) -{ - InitializeGlobals(); - StatisticsManager->GetTaggedMemoryUsage(tags, count, results); -} - -size_t GetMemoryUsageForTag(TMemoryTag tag) -{ - size_t result; - GetMemoryUsageForTags(&tag, 1, &result); - return result; -} - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace NYT - -namespace NYT::NYTAlloc { - -//////////////////////////////////////////////////////////////////////////////// -// Memory zone API bridge - -void SetCurrentMemoryZone(EMemoryZone zone) -{ - TThreadManager::SetCurrentMemoryZone(zone); -} - -EMemoryZone GetCurrentMemoryZone() -{ - return TThreadManager::GetCurrentMemoryZone(); -} - -EMemoryZone GetAllocationMemoryZone(const void* ptr) -{ - 
auto uintptr = reinterpret_cast(ptr); - if (uintptr >= MinUntaggedSmallPtr && uintptr < MaxUntaggedSmallPtr || - uintptr >= MinTaggedSmallPtr && uintptr < MaxTaggedSmallPtr || - uintptr >= DumpableLargeZoneStart && uintptr < DumpableLargeZoneEnd) - { - return EMemoryZone::Normal; - } else if (uintptr >= UndumpableLargeZoneStart && uintptr < UndumpableLargeZoneEnd) { - return EMemoryZone::Undumpable; - } else { - return EMemoryZone::Unknown; - } -} - -//////////////////////////////////////////////////////////////////////////////// -// Fiber id API - -void SetCurrentFiberId(TFiberId id) -{ - TThreadManager::SetCurrentFiberId(id); -} - -TFiberId GetCurrentFiberId() -{ - return TThreadManager::GetCurrentFiberId(); -} - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace NYT::NYTAlloc - -//////////////////////////////////////////////////////////////////////////////// -// Libc malloc bridge - -#ifdef YT_ALLOC_ENABLED - -using namespace NYT::NYTAlloc; - -extern "C" void* malloc(size_t size) -{ - return AllocateInline(size); -} - -extern "C" void* valloc(size_t size) -{ - return AllocatePageAlignedInline(size); -} - -extern "C" void* aligned_alloc(size_t alignment, size_t size) -{ - // Alignment must be a power of two. - Y_ABORT_UNLESS((alignment & (alignment - 1)) == 0); - // Alignment must not exceed the page size. - Y_ABORT_UNLESS(alignment <= PageSize); - if (alignment <= 16) { - // Proper alignment here is automatic. - return Allocate(size); - } else { - return AllocatePageAligned(size); - } -} - -extern "C" void* pvalloc(size_t size) -{ - return valloc(AlignUp(size, PageSize)); -} - -extern "C" int posix_memalign(void** ptrPtr, size_t alignment, size_t size) -{ - *ptrPtr = aligned_alloc(alignment, size); - return 0; -} - -extern "C" void* memalign(size_t alignment, size_t size) -{ - return aligned_alloc(alignment, size); -} - -extern "C" void* __libc_memalign(size_t alignment, size_t size) -{ - return aligned_alloc(alignment, size); -} - -extern "C" void free(void* ptr) -{ - FreeInline(ptr); -} - -extern "C" void* calloc(size_t n, size_t elemSize) -{ - // Overflow check. 
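// [Editor's note, illustration only] The division test below catches
// multiplication wrap-around: on a 64-bit target, n = (1ULL << 32) and
// elemSize = (1ULL << 32) multiply to size = 0, and since 0 / elemSize != n
// the function returns nullptr instead of handing out a zero-byte block.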
- auto size = n * elemSize; - if (elemSize != 0 && size / elemSize != n) { - return nullptr; - } - - void* result = Allocate(size); - ::memset(result, 0, size); - return result; -} - -extern "C" void cfree(void* ptr) -{ - Free(ptr); -} - -extern "C" void* realloc(void* oldPtr, size_t newSize) -{ - if (!oldPtr) { - return Allocate(newSize); - } - - if (newSize == 0) { - Free(oldPtr); - return nullptr; - } - - void* newPtr = Allocate(newSize); - size_t oldSize = GetAllocationSize(oldPtr); - ::memcpy(newPtr, oldPtr, std::min(oldSize, newSize)); - Free(oldPtr); - return newPtr; -} - -extern "C" size_t malloc_usable_size(void* ptr) noexcept -{ - return GetAllocationSize(ptr); -} - -extern "C" size_t nallocx(size_t size, int /* flags */) noexcept -{ - return GetAllocationSize(size); -} - -#endif - -namespace NMalloc { - -//////////////////////////////////////////////////////////////////////////////// -// Arcadia malloc API bridge - -TMallocInfo MallocInfo() -{ - TMallocInfo info; - info.Name = "ytalloc"; - return info; -} - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace NMalloc diff --git a/library/cpp/ytalloc/impl/core-inl.h b/library/cpp/ytalloc/impl/core-inl.h deleted file mode 100644 index 5a4f6a260b8c..000000000000 --- a/library/cpp/ytalloc/impl/core-inl.h +++ /dev/null @@ -1,4849 +0,0 @@ -#pragma once - -// This file contains the core parts of YTAlloc but no malloc/free-bridge. -// The latter bridge is placed into alloc.cpp, which includes (sic!) core-inl.h. -// This ensures that AllocateInline/FreeInline calls are properly inlined into malloc/free. -// Also core-inl.h can be directly included in, e.g., benchmarks. - -#include - -#include - -#include -#include - -#include - -#include -#include -#include - -#include - -#include -#include -#include - -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#ifdef _linux_ - #include -#endif - -#include -#include -#include - -#ifndef MAP_POPULATE - #define MAP_POPULATE 0x08000 -#endif - -// MAP_FIXED which doesn't unmap underlying mapping. -// Linux kernels older than 4.17 silently ignore this flag. 
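// [Editor's note, illustration only] Because older kernels ignore the flag,
// callers cannot rely on mmap() failing when the hinted range is occupied;
// they must compare the returned address with the hint, as
// TZoneAllocator::Allocate does later in this file. Roughly:
//
//   void* ptr = ::mmap(hint, size, PROT_READ | PROT_WRITE,
//                      MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, -1, 0);
//   if (ptr != MAP_FAILED && ptr != hint) {
//     ::munmap(ptr, size);  // landed elsewhere; retry with another hint
//   }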
-#ifndef MAP_FIXED_NOREPLACE - #ifdef _linux_ - #define MAP_FIXED_NOREPLACE 0x100000 - #else - #define MAP_FIXED_NOREPLACE 0 - #endif -#endif - -#ifndef MADV_POPULATE - #define MADV_POPULATE 0x59410003 -#endif - -#ifndef MADV_STOCKPILE - #define MADV_STOCKPILE 0x59410004 -#endif - -#ifndef MADV_FREE - #define MADV_FREE 8 -#endif - -#ifndef MADV_DONTDUMP - #define MADV_DONTDUMP 16 -#endif - -#ifndef NDEBUG - #define YTALLOC_PARANOID -#endif - -#ifdef YTALLOC_PARANOID - #define YTALLOC_NERVOUS -#endif - -#define YTALLOC_VERIFY(condition) \ - do { \ - if (Y_UNLIKELY(!(condition))) { \ - ::NYT::NYTAlloc::AssertTrap("Assertion failed: " #condition, __FILE__, __LINE__); \ - } \ - } while (false) - -#ifdef NDEBUG - #define YTALLOC_ASSERT(condition) YTALLOC_VERIFY(condition) -#else - #define YTALLOC_ASSERT(condition) (void)(0) -#endif - -#ifdef YTALLOC_PARANOID - #define YTALLOC_PARANOID_ASSERT(condition) YTALLOC_VERIFY(condition) -#else - #define YTALLOC_PARANOID_ASSERT(condition) (true || (condition)) -#endif - -#define YTALLOC_TRAP(message) ::NYT::NYTAlloc::AssertTrap(message, __FILE__, __LINE__) - -namespace NYT::NYTAlloc { - -//////////////////////////////////////////////////////////////////////////////// -// Allocations are classified into three types: -// -// a) Small chunks (less than LargeAllocationSizeThreshold) -// These are the fastest and are extensively cached (both per-thread and globally). -// Memory claimed for these allocations is never reclaimed back. -// Code dealing with such allocations is heavy optimized with all hot paths -// as streamlined as possible. The implementation is mostly inspired by LFAlloc. -// -// b) Large blobs (from LargeAllocationSizeThreshold to HugeAllocationSizeThreshold) -// These are cached as well. We expect such allocations to be less frequent -// than small ones but still do our best to provide good scalability. -// In particular, thread-sharded concurrent data structures as used to provide access to -// cached blobs. Memory is claimed via madvise(MADV_POPULATE) and reclaimed back -// via madvise(MADV_FREE). -// -// c) Huge blobs (from HugeAllocationSizeThreshold) -// These should be rare; we delegate directly to mmap and munmap for each allocation. -// -// We also provide a separate allocator for all system allocations (that are needed by YTAlloc itself). -// These are rare and also delegate to mmap/unmap. - -// Periods between background activities. 
-constexpr auto BackgroundInterval = TDuration::Seconds(1); - -static_assert(LargeRankCount - MinLargeRank <= 16, "Too many large ranks"); -static_assert(SmallRankCount <= 32, "Too many small ranks"); - -constexpr size_t SmallZoneSize = 1_TB; -constexpr size_t LargeZoneSize = 16_TB; -constexpr size_t HugeZoneSize = 1_TB; -constexpr size_t SystemZoneSize = 1_TB; - -constexpr size_t MaxCachedChunksPerRank = 256; - -constexpr uintptr_t UntaggedSmallZonesStart = 0; -constexpr uintptr_t UntaggedSmallZonesEnd = UntaggedSmallZonesStart + 32 * SmallZoneSize; -constexpr uintptr_t MinUntaggedSmallPtr = UntaggedSmallZonesStart + SmallZoneSize * 1; -constexpr uintptr_t MaxUntaggedSmallPtr = UntaggedSmallZonesStart + SmallZoneSize * SmallRankCount; - -constexpr uintptr_t TaggedSmallZonesStart = UntaggedSmallZonesEnd; -constexpr uintptr_t TaggedSmallZonesEnd = TaggedSmallZonesStart + 32 * SmallZoneSize; -constexpr uintptr_t MinTaggedSmallPtr = TaggedSmallZonesStart + SmallZoneSize * 1; -constexpr uintptr_t MaxTaggedSmallPtr = TaggedSmallZonesStart + SmallZoneSize * SmallRankCount; - -constexpr uintptr_t DumpableLargeZoneStart = TaggedSmallZonesEnd; -constexpr uintptr_t DumpableLargeZoneEnd = DumpableLargeZoneStart + LargeZoneSize; - -constexpr uintptr_t UndumpableLargeZoneStart = DumpableLargeZoneEnd; -constexpr uintptr_t UndumpableLargeZoneEnd = UndumpableLargeZoneStart + LargeZoneSize; - -constexpr uintptr_t LargeZoneStart(bool dumpable) -{ - return dumpable ? DumpableLargeZoneStart : UndumpableLargeZoneStart; -} -constexpr uintptr_t LargeZoneEnd(bool dumpable) -{ - return dumpable ? DumpableLargeZoneEnd : UndumpableLargeZoneEnd; -} - -constexpr uintptr_t HugeZoneStart = UndumpableLargeZoneEnd; -constexpr uintptr_t HugeZoneEnd = HugeZoneStart + HugeZoneSize; - -constexpr uintptr_t SystemZoneStart = HugeZoneEnd; -constexpr uintptr_t SystemZoneEnd = SystemZoneStart + SystemZoneSize; - -// We leave 64_KB at the end of 256_MB block and never use it. -// That serves two purposes: -// 1. SmallExtentSize % SmallSegmentSize == 0 -// 2. Every small object satisfies RightReadableArea requirement. -constexpr size_t SmallExtentAllocSize = 256_MB; -constexpr size_t SmallExtentSize = SmallExtentAllocSize - 64_KB; -constexpr size_t SmallSegmentSize = 96_KB; // LCM(SmallRankToSize) - -constexpr ui16 SmallRankBatchSize[SmallRankCount] = { - 0, 256, 256, 256, 256, 256, 256, 256, 256, 256, 192, 128, 96, 64, 48, 32, 24, 16, 12, 8, 6, 4, 3 -}; - -constexpr bool CheckSmallSizes() -{ - for (size_t rank = 0; rank < SmallRankCount; rank++) { - auto size = SmallRankToSize[rank]; - if (size == 0) { - continue; - } - - if (SmallSegmentSize % size != 0) { - return false; - } - - if (SmallRankBatchSize[rank] > MaxCachedChunksPerRank) { - return false; - } - } - - return true; -} - -static_assert(CheckSmallSizes()); -static_assert(SmallExtentSize % SmallSegmentSize == 0); -static_assert(SmallSegmentSize % PageSize == 0); - -constexpr size_t LargeExtentSize = 1_GB; -static_assert(LargeExtentSize >= LargeAllocationSizeThreshold, "LargeExtentSize < LargeAllocationSizeThreshold"); - -constexpr const char* BackgroundThreadName = "YTAllocBack"; -constexpr const char* StockpileThreadName = "YTAllocStock"; - -DEFINE_ENUM(EAllocationKind, - (Untagged) - (Tagged) -); - -// Forward declarations. 
-struct TThreadState; -struct TLargeArena; -struct TLargeBlobExtent; - -//////////////////////////////////////////////////////////////////////////////// -// Traps and assertions - -[[noreturn]] -void OomTrap() -{ - _exit(9); -} - -[[noreturn]] -void AssertTrap(const char* message, const char* file, int line) -{ - ::fprintf(stderr, "*** YTAlloc has detected an internal trap at %s:%d\n*** %s\n", - file, - line, - message); - __builtin_trap(); -} - -template -void AssertBlobState(T* header, E expectedState) -{ - auto actualState = header->State; - if (Y_UNLIKELY(actualState != expectedState)) { - char message[256]; - snprintf(message, sizeof(message), "Invalid blob header state at %p: expected %" PRIx64 ", actual %" PRIx64, - header, - static_cast(expectedState), - static_cast(actualState)); - YTALLOC_TRAP(message); - } -} - -//////////////////////////////////////////////////////////////////////////////// - -// Provides a never-dying singleton with explicit construction. -template -class TExplicitlyConstructableSingleton -{ -public: - TExplicitlyConstructableSingleton() - { } - - ~TExplicitlyConstructableSingleton() - { } - - template - void Construct(Ts&&... args) - { - new (&Storage_) T(std::forward(args)...); -#ifndef NDEBUG - Constructed_ = true; -#endif - } - - Y_FORCE_INLINE T* Get() - { -#ifndef NDEBUG - YTALLOC_PARANOID_ASSERT(Constructed_); -#endif - return &Storage_; - } - - Y_FORCE_INLINE const T* Get() const - { -#ifndef NDEBUG - YTALLOC_PARANOID_ASSERT(Constructed_); -#endif - return &Storage_; - } - - Y_FORCE_INLINE T* operator->() - { - return Get(); - } - - Y_FORCE_INLINE const T* operator->() const - { - return Get(); - } - - Y_FORCE_INLINE T& operator*() - { - return *Get(); - } - - Y_FORCE_INLINE const T& operator*() const - { - return *Get(); - } - -private: - union { - T Storage_; - }; - -#ifndef NDEBUG - bool Constructed_; -#endif -}; - -//////////////////////////////////////////////////////////////////////////////// - -// Initializes all singletons. -// Safe to call multiple times. -// Guaranteed to not allocate. -void InitializeGlobals(); - -// Spawns the background thread, if it's time. -// Safe to call multiple times. -// Must be called on allocation slow path. -void StartBackgroundThread(); - -//////////////////////////////////////////////////////////////////////////////// - -class TLogManager -{ -public: - // Sets the handler to be invoked for each log event produced by YTAlloc. - void EnableLogging(TLogHandler logHandler) - { - LogHandler_.store(logHandler); - } - - // Checks (in a racy way) that logging is enabled. - bool IsLoggingEnabled() - { - return LogHandler_.load() != nullptr; - } - - // Logs the message via log handler (if any). - template - void LogMessage(ELogEventSeverity severity, const char* format, Ts&&... args) - { - auto logHandler = LogHandler_.load(); - if (!logHandler) { - return; - } - - std::array buffer; - auto len = ::snprintf(buffer.data(), buffer.size(), format, std::forward(args)...); - - TLogEvent event; - event.Severity = severity; - event.Message = TStringBuf(buffer.data(), len); - logHandler(event); - } - - // A special case of zero args. - void LogMessage(ELogEventSeverity severity, const char* message) - { - LogMessage(severity, "%s", message); - } - -private: - std::atomic LogHandler_= nullptr; - -}; - -TExplicitlyConstructableSingleton LogManager; - -#define YTALLOC_LOG_EVENT(...) LogManager->LogMessage(__VA_ARGS__) -#define YTALLOC_LOG_DEBUG(...) 
YTALLOC_LOG_EVENT(ELogEventSeverity::Debug, __VA_ARGS__) -#define YTALLOC_LOG_INFO(...) YTALLOC_LOG_EVENT(ELogEventSeverity::Info, __VA_ARGS__) -#define YTALLOC_LOG_WARNING(...) YTALLOC_LOG_EVENT(ELogEventSeverity::Warning, __VA_ARGS__) -#define YTALLOC_LOG_ERROR(...) YTALLOC_LOG_EVENT(ELogEventSeverity::Error, __VA_ARGS__) - -//////////////////////////////////////////////////////////////////////////////// - -Y_FORCE_INLINE size_t GetUsed(ssize_t allocated, ssize_t freed) -{ - return allocated >= freed ? static_cast(allocated - freed) : 0; -} - -template -Y_FORCE_INLINE void* HeaderToPtr(T* header) -{ - return header + 1; -} - -template -Y_FORCE_INLINE T* PtrToHeader(void* ptr) -{ - return static_cast(ptr) - 1; -} - -template -Y_FORCE_INLINE const T* PtrToHeader(const void* ptr) -{ - return static_cast(ptr) - 1; -} - -Y_FORCE_INLINE size_t PtrToSmallRank(const void* ptr) -{ - return (reinterpret_cast(ptr) >> 40) & 0x1f; -} - -Y_FORCE_INLINE char* AlignDownToSmallSegment(char* extent, char* ptr) -{ - auto offset = static_cast(ptr - extent); - // NB: This modulo operation is always performed using multiplication. - offset -= (offset % SmallSegmentSize); - return extent + offset; -} - -Y_FORCE_INLINE char* AlignUpToSmallSegment(char* extent, char* ptr) -{ - return AlignDownToSmallSegment(extent, ptr + SmallSegmentSize - 1); -} - -template -static Y_FORCE_INLINE void UnalignPtr(void*& ptr) -{ - if (reinterpret_cast(ptr) % PageSize == 0) { - reinterpret_cast(ptr) -= PageSize - sizeof (T); - } - YTALLOC_PARANOID_ASSERT(reinterpret_cast(ptr) % PageSize == sizeof (T)); -} - -template -static Y_FORCE_INLINE void UnalignPtr(const void*& ptr) -{ - if (reinterpret_cast(ptr) % PageSize == 0) { - reinterpret_cast(ptr) -= PageSize - sizeof (T); - } - YTALLOC_PARANOID_ASSERT(reinterpret_cast(ptr) % PageSize == sizeof (T)); -} - -template -Y_FORCE_INLINE size_t GetRawBlobSize(size_t size) -{ - return AlignUp(size + sizeof (T) + RightReadableAreaSize, PageSize); -} - -template -Y_FORCE_INLINE size_t GetBlobAllocationSize(size_t size) -{ - size += sizeof(T); - size += RightReadableAreaSize; - size = AlignUp(size, PageSize); - size -= sizeof(T); - size -= RightReadableAreaSize; - return size; -} - -Y_FORCE_INLINE size_t GetLargeRank(size_t size) -{ - size_t rank = 64 - __builtin_clzl(size); - if (size == (1ULL << (rank - 1))) { - --rank; - } - return rank; -} - -Y_FORCE_INLINE void PoisonRange(void* ptr, size_t size, ui32 magic) -{ -#ifdef YTALLOC_PARANOID - size = ::AlignUp(size, 4); - std::fill(static_cast(ptr), static_cast(ptr) + size / 4, magic); -#else - Y_UNUSED(ptr); - Y_UNUSED(size); - Y_UNUSED(magic); -#endif -} - -Y_FORCE_INLINE void PoisonFreedRange(void* ptr, size_t size) -{ - PoisonRange(ptr, size, 0xdeadbeef); -} - -Y_FORCE_INLINE void PoisonUninitializedRange(void* ptr, size_t size) -{ - PoisonRange(ptr, size, 0xcafebabe); -} - -// Checks that the header size is divisible by 16 (as needed due to alignment restrictions). -#define CHECK_HEADER_ALIGNMENT(T) static_assert(sizeof(T) % 16 == 0, "sizeof(" #T ") % 16 != 0"); - -//////////////////////////////////////////////////////////////////////////////// - -static_assert(sizeof(TFreeList) == CacheLineSize, "sizeof(TFreeList) != CacheLineSize"); - -//////////////////////////////////////////////////////////////////////////////// - -constexpr size_t ShardCount = 16; -std::atomic GlobalCurrentShardIndex; - -// Provides a context for working with sharded data structures. 
-// Captures the initial shard index upon construction (indicating the shard -// where all insertions go). Maintains the current shard index (round-robin, -// indicating the shard currently used for extraction). -// Can be or be not thread-safe depending on TCounter. -template -class TShardedState -{ -public: - TShardedState() - : InitialShardIndex_(GlobalCurrentShardIndex++ % ShardCount) - , CurrentShardIndex_(InitialShardIndex_) - { } - - Y_FORCE_INLINE size_t GetInitialShardIndex() const - { - return InitialShardIndex_; - } - - Y_FORCE_INLINE size_t GetNextShardIndex() - { - return ++CurrentShardIndex_ % ShardCount; - } - -private: - const size_t InitialShardIndex_; - TCounter CurrentShardIndex_; -}; - -using TLocalShardedState = TShardedState; -using TGlobalShardedState = TShardedState>; - -// Implemented as a collection of free lists (each called a shard). -// One needs TShardedState to access the sharded data structure. -template -class TShardedFreeList -{ -public: - // First tries to extract an item from the initial shard; - // if failed then proceeds to all shards in round-robin fashion. - template - T* Extract(TState* state) - { - if (auto* item = Shards_[state->GetInitialShardIndex()].Extract()) { - return item; - } - return ExtractRoundRobin(state); - } - - // Attempts to extract an item from all shards in round-robin fashion. - template - T* ExtractRoundRobin(TState* state) - { - for (size_t index = 0; index < ShardCount; ++index) { - if (auto* item = Shards_[state->GetNextShardIndex()].Extract()) { - return item; - } - } - return nullptr; - } - - // Extracts items from all shards linking them together. - T* ExtractAll() - { - T* head = nullptr; - T* tail = nullptr; - for (auto& shard : Shards_) { - auto* item = shard.ExtractAll(); - if (!head) { - head = item; - } - if (tail) { - YTALLOC_PARANOID_ASSERT(!tail->Next); - tail->Next = item; - } else { - tail = item; - } - while (tail && tail->Next) { - tail = tail->Next; - } - } - return head; - } - - template - void Put(TState* state, T* item) - { - Shards_[state->GetInitialShardIndex()].Put(item); - } - -private: - std::array, ShardCount> Shards_; -}; - -//////////////////////////////////////////////////////////////////////////////// - -// Holds YTAlloc control knobs. -// Thread safe. 
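// [Editor's note, illustration only] The knobs below are reached through the
// ConfigurationManager singleton defined a bit further down, e.g.:
//
//   ConfigurationManager->SetTimingEventThreshold(TDuration::MilliSeconds(20));
//   ConfigurationManager->SetLargeUnreclaimableCoeff(0.1);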
-class TConfigurationManager -{ -public: - void SetLargeUnreclaimableCoeff(double value) - { - LargeUnreclaimableCoeff_.store(value); - } - - double GetLargeUnreclaimableCoeff() const - { - return LargeUnreclaimableCoeff_.load(std::memory_order_relaxed); - } - - - void SetMinLargeUnreclaimableBytes(size_t value) - { - MinLargeUnreclaimableBytes_.store(value); - } - - void SetMaxLargeUnreclaimableBytes(size_t value) - { - MaxLargeUnreclaimableBytes_.store(value); - } - - size_t GetMinLargeUnreclaimableBytes() const - { - return MinLargeUnreclaimableBytes_.load(std::memory_order_relaxed); - } - - size_t GetMaxLargeUnreclaimableBytes() const - { - return MaxLargeUnreclaimableBytes_.load(std::memory_order_relaxed); - } - - - void SetTimingEventThreshold(TDuration value) - { - TimingEventThresholdNs_.store(value.MicroSeconds() * 1000); - } - - i64 GetTimingEventThresholdNs() const - { - return TimingEventThresholdNs_.load(std::memory_order_relaxed); - } - - - void SetAllocationProfilingEnabled(bool value); - - bool IsAllocationProfilingEnabled() const - { - return AllocationProfilingEnabled_.load(); - } - - - Y_FORCE_INLINE bool GetAllocationProfilingSamplingRate() - { - return AllocationProfilingSamplingRate_.load(); - } - - void SetAllocationProfilingSamplingRate(double rate) - { - if (rate < 0) { - rate = 0; - } - if (rate > 1) { - rate = 1; - } - i64 rateX64K = static_cast(rate * (1ULL << 16)); - AllocationProfilingSamplingRateX64K_.store(ClampVal(rateX64K, 0, std::numeric_limits::max() + 1)); - AllocationProfilingSamplingRate_.store(rate); - } - - - Y_FORCE_INLINE bool IsSmallArenaAllocationProfilingEnabled(size_t rank) - { - return SmallArenaAllocationProfilingEnabled_[rank].load(std::memory_order_relaxed); - } - - Y_FORCE_INLINE bool IsSmallArenaAllocationProfiled(size_t rank) - { - return IsSmallArenaAllocationProfilingEnabled(rank) && IsAllocationSampled(); - } - - void SetSmallArenaAllocationProfilingEnabled(size_t rank, bool value) - { - if (rank >= SmallRankCount) { - return; - } - SmallArenaAllocationProfilingEnabled_[rank].store(value); - } - - - Y_FORCE_INLINE bool IsLargeArenaAllocationProfilingEnabled(size_t rank) - { - return LargeArenaAllocationProfilingEnabled_[rank].load(std::memory_order_relaxed); - } - - Y_FORCE_INLINE bool IsLargeArenaAllocationProfiled(size_t rank) - { - return IsLargeArenaAllocationProfilingEnabled(rank) && IsAllocationSampled(); - } - - void SetLargeArenaAllocationProfilingEnabled(size_t rank, bool value) - { - if (rank >= LargeRankCount) { - return; - } - LargeArenaAllocationProfilingEnabled_[rank].store(value); - } - - - Y_FORCE_INLINE int GetProfilingBacktraceDepth() - { - return ProfilingBacktraceDepth_.load(); - } - - void SetProfilingBacktraceDepth(int depth) - { - if (depth < 1) { - return; - } - if (depth > MaxAllocationProfilingBacktraceDepth) { - depth = MaxAllocationProfilingBacktraceDepth; - } - ProfilingBacktraceDepth_.store(depth); - } - - - Y_FORCE_INLINE size_t GetMinProfilingBytesUsedToReport() - { - return MinProfilingBytesUsedToReport_.load(); - } - - void SetMinProfilingBytesUsedToReport(size_t size) - { - MinProfilingBytesUsedToReport_.store(size); - } - - void SetEnableEagerMemoryRelease(bool value) - { - EnableEagerMemoryRelease_.store(value); - } - - bool GetEnableEagerMemoryRelease() - { - return EnableEagerMemoryRelease_.load(std::memory_order_relaxed); - } - - void SetEnableMadvisePopulate(bool value) - { - EnableMadvisePopulate_.store(value); - } - - bool GetEnableMadvisePopulate() - { - return 
EnableMadvisePopulate_.load(std::memory_order_relaxed); - } - - void EnableStockpile() - { - StockpileEnabled_.store(true); - } - - bool IsStockpileEnabled() - { - return StockpileEnabled_.load(); - } - - void SetStockpileInterval(TDuration value) - { - StockpileInterval_.store(value); - } - - TDuration GetStockpileInterval() - { - return StockpileInterval_.load(); - } - - void SetStockpileThreadCount(int count) - { - StockpileThreadCount_.store(count); - } - - int GetStockpileThreadCount() - { - return ClampVal(StockpileThreadCount_.load(), 0, MaxStockpileThreadCount); - } - - void SetStockpileSize(size_t value) - { - StockpileSize_.store(value); - } - - size_t GetStockpileSize() - { - return StockpileSize_.load(); - } - -private: - std::atomic LargeUnreclaimableCoeff_ = 0.05; - std::atomic MinLargeUnreclaimableBytes_ = 128_MB; - std::atomic MaxLargeUnreclaimableBytes_ = 10_GB; - std::atomic TimingEventThresholdNs_ = 10000000; // in ns, 10 ms by default - - std::atomic AllocationProfilingEnabled_ = false; - std::atomic AllocationProfilingSamplingRate_ = 1.0; - std::atomic AllocationProfilingSamplingRateX64K_ = std::numeric_limits::max(); - std::array, SmallRankCount> SmallArenaAllocationProfilingEnabled_ = {}; - std::array, LargeRankCount> LargeArenaAllocationProfilingEnabled_ = {}; - std::atomic ProfilingBacktraceDepth_ = 10; - std::atomic MinProfilingBytesUsedToReport_ = 1_MB; - - std::atomic EnableEagerMemoryRelease_ = true; - std::atomic EnableMadvisePopulate_ = false; - - std::atomic StockpileEnabled_ = false; - std::atomic StockpileInterval_ = TDuration::MilliSeconds(10); - static constexpr int MaxStockpileThreadCount = 8; - std::atomic StockpileThreadCount_ = 4; - std::atomic StockpileSize_ = 1_GB; - -private: - bool IsAllocationSampled() - { - Y_POD_STATIC_THREAD(ui16) Counter; - return Counter++ < AllocationProfilingSamplingRateX64K_.load(); - } -}; - -TExplicitlyConstructableSingleton ConfigurationManager; - -//////////////////////////////////////////////////////////////////////////////// - -template -class TEventLogManagerBase -{ -public: - void DisableForCurrentThread() - { - TManager::DisabledForCurrentThread_ = true; - } - - template - void EnqueueEvent(TArgs&&... 
args) - { - if (TManager::DisabledForCurrentThread_) { - return; - } - - auto timestamp = TInstant::Now(); - auto fiberId = NYTAlloc::GetCurrentFiberId(); - auto guard = Guard(EventLock_); - - auto event = TEvent(args...); - OnEvent(event); - - if (EventCount_ >= EventBufferSize) { - return; - } - - auto& enqueuedEvent = Events_[EventCount_++]; - enqueuedEvent = std::move(event); - enqueuedEvent.Timestamp = timestamp; - enqueuedEvent.FiberId = fiberId; - } - - void RunBackgroundTasks() - { - if (LogManager->IsLoggingEnabled()) { - for (const auto& event : PullEvents()) { - ProcessEvent(event); - } - } - } - -protected: - NThreading::TForkAwareSpinLock EventLock_; - - virtual void OnEvent(const TEvent& event) = 0; - - virtual void ProcessEvent(const TEvent& event) = 0; - -private: - static constexpr size_t EventBufferSize = 1000; - size_t EventCount_ = 0; - std::array Events_; - - std::vector PullEvents() - { - std::vector events; - events.reserve(EventBufferSize); - - auto guard = Guard(EventLock_); - for (size_t index = 0; index < EventCount_; ++index) { - events.push_back(Events_[index]); - } - EventCount_ = 0; - return events; - } -}; - -//////////////////////////////////////////////////////////////////////////////// - -struct TTimingEvent -{ - ETimingEventType Type; - TDuration Duration; - size_t Size; - TInstant Timestamp; - TFiberId FiberId; - - TTimingEvent() - { } - - TTimingEvent( - ETimingEventType type, - TDuration duration, - size_t size) - : Type(type) - , Duration(duration) - , Size(size) - { } -}; - -class TTimingManager - : public TEventLogManagerBase -{ -public: - TEnumIndexedArray GetTimingEventCounters() - { - auto guard = Guard(EventLock_); - return EventCounters_; - } - -private: - TEnumIndexedArray EventCounters_; - - Y_POD_STATIC_THREAD(bool) DisabledForCurrentThread_; - - friend class TEventLogManagerBase; - - virtual void OnEvent(const TTimingEvent& event) override - { - auto& counters = EventCounters_[event.Type]; - counters.Count += 1; - counters.Size += event.Size; - } - - virtual void ProcessEvent(const TTimingEvent& event) override - { - YTALLOC_LOG_DEBUG("Timing event logged (Type: %s, Duration: %s, Size: %zu, Timestamp: %s, FiberId: %" PRIu64 ")", - ToString(event.Type).c_str(), - ToString(event.Duration).c_str(), - event.Size, - ToString(event.Timestamp).c_str(), - event.FiberId); - } -}; - -Y_POD_THREAD(bool) TTimingManager::DisabledForCurrentThread_; - -TExplicitlyConstructableSingleton TimingManager; - -//////////////////////////////////////////////////////////////////////////////// - -i64 GetElapsedNs(const struct timespec& startTime, const struct timespec& endTime) -{ - if (Y_LIKELY(startTime.tv_sec == endTime.tv_sec)) { - return static_cast(endTime.tv_nsec) - static_cast(startTime.tv_nsec); - } - - return - static_cast(endTime.tv_nsec) - static_cast(startTime.tv_nsec) + - (static_cast(endTime.tv_sec) - static_cast(startTime.tv_sec)) * 1000000000; -} - -// Used to log statistics about long-running syscalls and lock acquisitions. 
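// [Editor's note, illustration only] Typical use, mirroring what
// TMappedMemoryManager does below: construct the guard on the stack around a
// syscall, and a timing event is enqueued only if the call exceeds the
// configured threshold.
//
//   {
//     TTimingGuard timingGuard(ETimingEventType::Munmap, size);
//     ::munmap(ptr, size);
//   }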
-class TTimingGuard - : public TNonCopyable -{ -public: - explicit TTimingGuard(ETimingEventType eventType, size_t size = 0) - : EventType_(eventType) - , Size_(size) - { - ::clock_gettime(CLOCK_MONOTONIC, &StartTime_); - } - - ~TTimingGuard() - { - auto elapsedNs = GetElapsedNs(); - if (elapsedNs > ConfigurationManager->GetTimingEventThresholdNs()) { - TimingManager->EnqueueEvent(EventType_, TDuration::MicroSeconds(elapsedNs / 1000), Size_); - } - } - -private: - const ETimingEventType EventType_; - const size_t Size_; - struct timespec StartTime_; - - i64 GetElapsedNs() const - { - struct timespec endTime; - ::clock_gettime(CLOCK_MONOTONIC, &endTime); - return NYTAlloc::GetElapsedNs(StartTime_, endTime); - } -}; - -template -Y_FORCE_INLINE TGuard GuardWithTiming(const T& lock) -{ - TTimingGuard timingGuard(ETimingEventType::Locking); - TGuard lockingGuard(lock); - return lockingGuard; -} - -//////////////////////////////////////////////////////////////////////////////// - -// A wrapper for mmap, mumap, and madvise calls. -// The latter are invoked with MADV_POPULATE (if enabled) and MADV_FREE flags -// and may fail if the OS support is missing. These failures are logged (once) and -// handled as follows: -// * if MADV_POPULATE fails then we fallback to manual per-page prefault -// for all subsequent attempts; -// * if MADV_FREE fails then it (and all subsequent attempts) is replaced with MADV_DONTNEED -// (which is non-lazy and is less efficient but will somehow do). -// Also this class mlocks all VMAs on startup to prevent pagefaults in our heavy binaries -// from disturbing latency tails. -class TMappedMemoryManager -{ -public: - void* Map(uintptr_t hint, size_t size, int flags) - { - TTimingGuard timingGuard(ETimingEventType::Mmap, size); - auto* result = ::mmap( - reinterpret_cast(hint), - size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | flags, - -1, - 0); - if (result == MAP_FAILED) { - auto error = errno; - if (error == EEXIST && (flags & MAP_FIXED_NOREPLACE)) { - // Caller must retry with different hint address. - return result; - } - YTALLOC_VERIFY(error == ENOMEM); - ::fprintf(stderr, "*** YTAlloc has received ENOMEM error while trying to mmap %zu bytes\n", - size); - OomTrap(); - } - return result; - } - - void Unmap(void* ptr, size_t size) - { - TTimingGuard timingGuard(ETimingEventType::Munmap, size); - auto result = ::munmap(ptr, size); - YTALLOC_VERIFY(result == 0); - } - - void DontDump(void* ptr, size_t size) - { - auto result = ::madvise(ptr, size, MADV_DONTDUMP); - // Must not fail. 
- YTALLOC_VERIFY(result == 0); - } - - void PopulateFile(void* ptr, size_t size) - { - TTimingGuard timingGuard(ETimingEventType::FilePrefault, size); - - auto* begin = static_cast(ptr); - for (auto* current = begin; current < begin + size; current += PageSize) { - *current; - } - } - - void PopulateReadOnly(void* ptr, size_t size) - { - if (!MadvisePopulateUnavailable_.load(std::memory_order_relaxed) && - ConfigurationManager->GetEnableMadvisePopulate()) - { - if (!TryMadvisePopulate(ptr, size)) { - MadvisePopulateUnavailable_.store(true); - } - } - } - - void Populate(void* ptr, size_t size) - { - if (MadvisePopulateUnavailable_.load(std::memory_order_relaxed) || - !ConfigurationManager->GetEnableMadvisePopulate()) - { - DoPrefault(ptr, size); - } else if (!TryMadvisePopulate(ptr, size)) { - MadvisePopulateUnavailable_.store(true); - DoPrefault(ptr, size); - } - } - - void Release(void* ptr, size_t size) - { - if (CanUseMadviseFree() && !ConfigurationManager->GetEnableEagerMemoryRelease()) { - DoMadviseFree(ptr, size); - } else { - DoMadviseDontNeed(ptr, size); - } - } - - bool Stockpile(size_t size) - { - if (MadviseStockpileUnavailable_.load(std::memory_order_relaxed)) { - return false; - } - if (!TryMadviseStockpile(size)) { - MadviseStockpileUnavailable_.store(true); - return false; - } - return true; - } - - void RunBackgroundTasks() - { - if (!LogManager->IsLoggingEnabled()) { - return; - } - if (IsBuggyKernel() && !BuggyKernelLogged_) { - YTALLOC_LOG_WARNING("Kernel is buggy; see KERNEL-118"); - BuggyKernelLogged_ = true; - } - if (MadviseFreeSupported_ && !MadviseFreeSupportedLogged_) { - YTALLOC_LOG_INFO("MADV_FREE is supported"); - MadviseFreeSupportedLogged_ = true; - } - if (MadviseFreeNotSupported_ && !MadviseFreeNotSupportedLogged_) { - YTALLOC_LOG_WARNING("MADV_FREE is not supported"); - MadviseFreeNotSupportedLogged_ = true; - } - if (MadvisePopulateUnavailable_.load() && !MadvisePopulateUnavailableLogged_) { - YTALLOC_LOG_WARNING("MADV_POPULATE is not supported"); - MadvisePopulateUnavailableLogged_ = true; - } - if (MadviseStockpileUnavailable_.load() && !MadviseStockpileUnavailableLogged_) { - YTALLOC_LOG_WARNING("MADV_STOCKPILE is not supported"); - MadviseStockpileUnavailableLogged_ = true; - } - } - -private: - bool BuggyKernelLogged_ = false; - - std::atomic MadviseFreeSupported_ = false; - bool MadviseFreeSupportedLogged_ = false; - - std::atomic MadviseFreeNotSupported_ = false; - bool MadviseFreeNotSupportedLogged_ = false; - - std::atomic MadvisePopulateUnavailable_ = false; - bool MadvisePopulateUnavailableLogged_ = false; - - std::atomic MadviseStockpileUnavailable_ = false; - bool MadviseStockpileUnavailableLogged_ = false; - -private: - bool TryMadvisePopulate(void* ptr, size_t size) - { - TTimingGuard timingGuard(ETimingEventType::MadvisePopulate, size); - auto result = ::madvise(ptr, size, MADV_POPULATE); - if (result != 0) { - auto error = errno; - YTALLOC_VERIFY(error == EINVAL || error == ENOMEM); - if (error == ENOMEM) { - ::fprintf(stderr, "*** YTAlloc has received ENOMEM error while trying to madvise(MADV_POPULATE) %zu bytes\n", - size); - OomTrap(); - } - return false; - } - return true; - } - - void DoPrefault(void* ptr, size_t size) - { - TTimingGuard timingGuard(ETimingEventType::Prefault, size); - auto* begin = static_cast(ptr); - for (auto* current = begin; current < begin + size; current += PageSize) { - *current = 0; - } - } - - bool CanUseMadviseFree() - { - if (MadviseFreeSupported_.load()) { - return true; - } - if 
(MadviseFreeNotSupported_.load()) { - return false; - } - - if (IsBuggyKernel()) { - MadviseFreeNotSupported_.store(true); - } else { - auto* ptr = Map(0, PageSize, 0); - if (::madvise(ptr, PageSize, MADV_FREE) == 0) { - MadviseFreeSupported_.store(true); - } else { - MadviseFreeNotSupported_.store(true); - } - Unmap(ptr, PageSize); - } - - // Will not recurse. - return CanUseMadviseFree(); - } - - void DoMadviseDontNeed(void* ptr, size_t size) - { - TTimingGuard timingGuard(ETimingEventType::MadviseDontNeed, size); - auto result = ::madvise(ptr, size, MADV_DONTNEED); - if (result != 0) { - auto error = errno; - // Failure is possible for locked pages. - Y_ABORT_UNLESS(error == EINVAL); - } - } - - void DoMadviseFree(void* ptr, size_t size) - { - TTimingGuard timingGuard(ETimingEventType::MadviseFree, size); - auto result = ::madvise(ptr, size, MADV_FREE); - if (result != 0) { - auto error = errno; - // Failure is possible for locked pages. - YTALLOC_VERIFY(error == EINVAL); - } - } - - bool TryMadviseStockpile(size_t size) - { - auto result = ::madvise(nullptr, size, MADV_STOCKPILE); - if (result != 0) { - auto error = errno; - if (error == ENOMEM || error == EAGAIN || error == EINTR) { - // The call is advisory, ignore ENOMEM, EAGAIN, and EINTR. - return true; - } - YTALLOC_VERIFY(error == EINVAL); - return false; - } - return true; - } - - // Some kernels are known to contain bugs in MADV_FREE; see https://st.yandex-team.ru/KERNEL-118. - bool IsBuggyKernel() - { -#ifdef _linux_ - static const bool result = [] () { - struct utsname buf; - YTALLOC_VERIFY(uname(&buf) == 0); - if (strverscmp(buf.release, "4.4.1-1") >= 0 && - strverscmp(buf.release, "4.4.96-44") < 0) - { - return true; - } - if (strverscmp(buf.release, "4.14.1-1") >= 0 && - strverscmp(buf.release, "4.14.79-33") < 0) - { - return true; - } - return false; - }(); - return result; -#else - return false; -#endif - } -}; - -TExplicitlyConstructableSingleton MappedMemoryManager; - -//////////////////////////////////////////////////////////////////////////////// -// System allocator - -// Each system allocation is prepended with such a header. -struct TSystemBlobHeader -{ - explicit TSystemBlobHeader(size_t size) - : Size(size) - { } - - size_t Size; - char Padding[8]; -}; - -CHECK_HEADER_ALIGNMENT(TSystemBlobHeader) - -// Used for some internal allocations. -// Delgates directly to TMappedMemoryManager. -class TSystemAllocator -{ -public: - void* Allocate(size_t size); - void Free(void* ptr); - -private: - std::atomic CurrentPtr_ = SystemZoneStart; -}; - -TExplicitlyConstructableSingleton SystemAllocator; - -//////////////////////////////////////////////////////////////////////////////// - -// Deriving from this class makes instances bound to TSystemAllocator. -struct TSystemAllocatable -{ - void* operator new(size_t size) noexcept - { - return SystemAllocator->Allocate(size); - } - - void* operator new[](size_t size) noexcept - { - return SystemAllocator->Allocate(size); - } - - void operator delete(void* ptr) noexcept - { - SystemAllocator->Free(ptr); - } - - void operator delete[](void* ptr) noexcept - { - SystemAllocator->Free(ptr); - } -}; - -//////////////////////////////////////////////////////////////////////////////// - -// Maintains a pool of objects. -// Objects are allocated in groups each containing BatchSize instances. -// The actual allocation is carried out by TSystemAllocator. -// Memory is never actually reclaimed; freed instances are put into TFreeList. 
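// [Editor's note, illustration only] Sketch of the intended usage, with a
// hypothetical element type TMyItem that TFreeList can link (i.e., it carries
// the intrusive Next pointer):
//
//   TSystemPool<TMyItem, 64> pool;
//   auto* item = pool.Allocate();  // placement-new on a free-listed slot
//   pool.Free(item);               // destroyed, poisoned, put back on the list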
-template -class TSystemPool -{ -public: - T* Allocate() - { - while (true) { - auto* obj = FreeList_.Extract(); - if (Y_LIKELY(obj)) { - new (obj) T(); - return obj; - } - AllocateMore(); - } - } - - void Free(T* obj) - { - obj->T::~T(); - PoisonFreedRange(obj, sizeof(T)); - FreeList_.Put(obj); - } - -private: - TFreeList FreeList_; - -private: - void AllocateMore() - { - auto* objs = static_cast(SystemAllocator->Allocate(sizeof(T) * BatchSize)); - for (size_t index = 0; index < BatchSize; ++index) { - auto* obj = objs + index; - FreeList_.Put(obj); - } - } -}; - -// A sharded analogue TSystemPool. -template -class TShardedSystemPool -{ -public: - template - T* Allocate(TState* state) - { - if (auto* obj = FreeLists_[state->GetInitialShardIndex()].Extract()) { - new (obj) T(); - return obj; - } - - while (true) { - for (size_t index = 0; index < ShardCount; ++index) { - if (auto* obj = FreeLists_[state->GetNextShardIndex()].Extract()) { - new (obj) T(); - return obj; - } - } - AllocateMore(); - } - } - - template - void Free(TState* state, T* obj) - { - obj->T::~T(); - PoisonFreedRange(obj, sizeof(T)); - FreeLists_[state->GetInitialShardIndex()].Put(obj); - } - -private: - std::array, ShardCount> FreeLists_; - -private: - void AllocateMore() - { - auto* objs = static_cast(SystemAllocator->Allocate(sizeof(T) * BatchSize)); - for (size_t index = 0; index < BatchSize; ++index) { - auto* obj = objs + index; - FreeLists_[index % ShardCount].Put(obj); - } - } -}; - -//////////////////////////////////////////////////////////////////////////////// - -// Handles allocations inside a zone of memory given by its start and end pointers. -// Each allocation is a separate mapped region of memory. -// A special care is taken to guarantee that all allocated regions fall inside the zone. -class TZoneAllocator -{ -public: - TZoneAllocator(uintptr_t zoneStart, uintptr_t zoneEnd) - : ZoneStart_(zoneStart) - , ZoneEnd_(zoneEnd) - , Current_(zoneStart) - { - YTALLOC_VERIFY(ZoneStart_ % PageSize == 0); - } - - void* Allocate(size_t size, int flags) - { - YTALLOC_VERIFY(size % PageSize == 0); - bool restarted = false; - while (true) { - auto hint = (Current_ += size) - size; - if (reinterpret_cast(hint) + size > ZoneEnd_) { - if (restarted) { - ::fprintf(stderr, "*** YTAlloc was unable to mmap %zu bytes in zone %" PRIx64 "--%" PRIx64 "\n", - size, - ZoneStart_, - ZoneEnd_); - OomTrap(); - } - restarted = true; - Current_ = ZoneStart_; - } else { - char* ptr = static_cast(MappedMemoryManager->Map( - hint, - size, - MAP_FIXED_NOREPLACE | flags)); - if (reinterpret_cast(ptr) == hint) { - return ptr; - } - if (ptr != MAP_FAILED) { - MappedMemoryManager->Unmap(ptr, size); - } - } - } - } - - void Free(void* ptr, size_t size) - { - MappedMemoryManager->Unmap(ptr, size); - } - -private: - const uintptr_t ZoneStart_; - const uintptr_t ZoneEnd_; - - std::atomic Current_; -}; - -//////////////////////////////////////////////////////////////////////////////// - -// YTAlloc supports tagged allocations. -// Since the total number of tags can be huge, a two-level scheme is employed. -// Possible tags are arranged into sets each containing TaggedCounterSetSize tags. -// There are up to MaxTaggedCounterSets in total. -// Upper 4 sets are reserved for profiled allocations. 
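// A trimmed-down sketch of the TZoneAllocator pattern above: a bump cursor over a
// reserved virtual-address zone, mapped with MAP_FIXED_NOREPLACE so unrelated mappings
// are never clobbered. The nullptr-on-exhaustion behaviour and the flag set are
// simplifications rather than the library's code; MAP_FIXED_NOREPLACE needs Linux 4.17+
// and a libc that defines it.
#include <sys/mman.h>
#include <atomic>
#include <cstddef>
#include <cstdint>

class TZoneAllocatorSketch
{
public:
    TZoneAllocatorSketch(uintptr_t zoneStart, uintptr_t zoneEnd)
        : ZoneStart_(zoneStart)
        , ZoneEnd_(zoneEnd)
        , Current_(zoneStart)
    { }

    void* Allocate(size_t size) // size must be a multiple of the page size
    {
        bool restarted = false;
        while (true) {
            uintptr_t hint = Current_.fetch_add(size);
            if (hint + size > ZoneEnd_) {
                if (restarted) {
                    return nullptr; // zone exhausted (the real code traps instead)
                }
                restarted = true;
                Current_.store(ZoneStart_);
                continue;
            }
            void* ptr = ::mmap(reinterpret_cast<void*>(hint), size,
                               PROT_READ | PROT_WRITE,
                               MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE,
                               -1, 0);
            if (ptr == reinterpret_cast<void*>(hint)) {
                return ptr; // landed exactly at the requested address
            }
            // Older kernels ignore MAP_FIXED_NOREPLACE and may place the mapping
            // elsewhere; discard it and retry with the next cursor position.
            if (ptr != MAP_FAILED) {
                ::munmap(ptr, size);
            }
        }
    }

private:
    const uintptr_t ZoneStart_;
    const uintptr_t ZoneEnd_;
    std::atomic<uintptr_t> Current_;
};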
-constexpr size_t TaggedCounterSetSize = 16384; -constexpr size_t AllocationProfilingTaggedCounterSets = 4; -constexpr size_t MaxTaggedCounterSets = 256 + AllocationProfilingTaggedCounterSets; - -constexpr size_t MaxCapturedAllocationBacktraces = 65000; -static_assert( - MaxCapturedAllocationBacktraces < AllocationProfilingTaggedCounterSets * TaggedCounterSetSize, - "MaxCapturedAllocationBacktraces is too big"); - -constexpr TMemoryTag AllocationProfilingMemoryTagBase = TaggedCounterSetSize * (MaxTaggedCounterSets - AllocationProfilingTaggedCounterSets); -constexpr TMemoryTag AllocationProfilingUnknownMemoryTag = AllocationProfilingMemoryTagBase + MaxCapturedAllocationBacktraces; - -static_assert( - MaxMemoryTag == TaggedCounterSetSize * (MaxTaggedCounterSets - AllocationProfilingTaggedCounterSets) - 1, - "Wrong MaxMemoryTag"); - -template -using TUntaggedTotalCounters = TEnumIndexedArray; - -template -struct TTaggedTotalCounterSet - : public TSystemAllocatable -{ - std::array, TaggedCounterSetSize> Counters; -}; - -using TLocalTaggedBasicCounterSet = TTaggedTotalCounterSet; -using TGlobalTaggedBasicCounterSet = TTaggedTotalCounterSet>; - -template -struct TTotalCounters -{ - // The sum of counters across all tags. - TUntaggedTotalCounters CumulativeTaggedCounters; - - // Counters for untagged allocations. - TUntaggedTotalCounters UntaggedCounters; - - // Access to tagged counters may involve creation of a new tag set. - // For simplicity, we separate the read-side (TaggedCounterSets) and the write-side (TaggedCounterSetHolders). - // These arrays contain virtually identical data (up to std::unique_ptr and std::atomic semantic differences). - std::array*>, MaxTaggedCounterSets> TaggedCounterSets{}; - std::array>, MaxTaggedCounterSets> TaggedCounterSetHolders; - - // Protects TaggedCounterSetHolders from concurrent updates. - NThreading::TForkAwareSpinLock TaggedCounterSetsLock; - - // Returns null if the set is not yet constructed. - Y_FORCE_INLINE TTaggedTotalCounterSet* FindTaggedCounterSet(size_t index) const - { - return TaggedCounterSets[index].load(); - } - - // Constructs the set on first access. 
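// The read-side/write-side split described above reduces to a lazily constructed slot
// array: readers consult an atomic pointer with no lock, writers construct under a lock
// and then publish. A generic sketch of that shape, with std::mutex standing in for the
// fork-aware spin lock; GetOrCreateTaggedCounterSet below is the concrete instance, with
// index = tag / TaggedCounterSetSize and the counter slot = tag % TaggedCounterSetSize.
#include <array>
#include <atomic>
#include <cstddef>
#include <memory>
#include <mutex>

template <class TSet, size_t N>
class TLazySlotArraySketch
{
public:
    // Lock-free read side: null means "never created".
    TSet* Find(size_t index) const
    {
        return ReadSide_[index].load(std::memory_order_acquire);
    }

    // Write side: constructs on first access and publishes; from then on the read
    // side answers without taking the lock.
    TSet* GetOrCreate(size_t index)
    {
        if (auto* set = ReadSide_[index].load(std::memory_order_acquire)) {
            return set;
        }
        std::lock_guard<std::mutex> guard(Lock_);
        auto& holder = WriteSide_[index];
        if (!holder) {
            holder = std::make_unique<TSet>();
            ReadSide_[index].store(holder.get(), std::memory_order_release);
        }
        return holder.get();
    }

private:
    std::array<std::atomic<TSet*>, N> ReadSide_{};
    std::array<std::unique_ptr<TSet>, N> WriteSide_;
    std::mutex Lock_;
};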
- TTaggedTotalCounterSet* GetOrCreateTaggedCounterSet(size_t index) - { - auto* set = TaggedCounterSets[index].load(); - if (Y_LIKELY(set)) { - return set; - } - - auto guard = GuardWithTiming(TaggedCounterSetsLock); - auto& setHolder = TaggedCounterSetHolders[index]; - if (!setHolder) { - setHolder = std::make_unique>(); - TaggedCounterSets[index] = setHolder.get(); - } - return setHolder.get(); - } -}; - -using TLocalSystemCounters = TEnumIndexedArray; -using TGlobalSystemCounters = TEnumIndexedArray>; - -using TLocalSmallCounters = TEnumIndexedArray; -using TGlobalSmallCounters = TEnumIndexedArray>; - -using TLocalLargeCounters = TEnumIndexedArray; -using TGlobalLargeCounters = TEnumIndexedArray>; - -using TLocalHugeCounters = TEnumIndexedArray; -using TGlobalHugeCounters = TEnumIndexedArray>; - -using TLocalUndumpableCounters = TEnumIndexedArray; -using TGlobalUndumpableCounters = TEnumIndexedArray>; - -Y_FORCE_INLINE ssize_t LoadCounter(ssize_t counter) -{ - return counter; -} - -Y_FORCE_INLINE ssize_t LoadCounter(const std::atomic& counter) -{ - return counter.load(); -} - -//////////////////////////////////////////////////////////////////////////////// - -struct TMmapObservationEvent -{ - size_t Size; - std::array Frames; - int FrameCount; - TInstant Timestamp; - TFiberId FiberId; - - TMmapObservationEvent() = default; - - TMmapObservationEvent( - size_t size, - std::array frames, - int frameCount) - : Size(size) - , Frames(frames) - , FrameCount(frameCount) - { } -}; - -class TMmapObservationManager - : public TEventLogManagerBase -{ -public: - void SetBacktraceFormatter(TBacktraceFormatter formatter) - { - BacktraceFormatter_.store(formatter); - } - -private: - std::atomic BacktraceFormatter_ = nullptr; - - Y_POD_STATIC_THREAD(bool) DisabledForCurrentThread_; - - friend class TEventLogManagerBase; - - virtual void OnEvent(const TMmapObservationEvent& /*event*/) override - { } - - virtual void ProcessEvent(const TMmapObservationEvent& event) override - { - YTALLOC_LOG_DEBUG("Large arena mmap observed (Size: %zu, Timestamp: %s, FiberId: %" PRIx64 ")", - event.Size, - ToString(event.Timestamp).c_str(), - event.FiberId); - - if (auto backtraceFormatter = BacktraceFormatter_.load()) { - auto backtrace = backtraceFormatter(const_cast(event.Frames.data()), event.FrameCount); - YTALLOC_LOG_DEBUG("YTAlloc stack backtrace (Stack: %s)", - backtrace.c_str()); - } - } -}; - -Y_POD_THREAD(bool) TMmapObservationManager::DisabledForCurrentThread_; - -TExplicitlyConstructableSingleton MmapObservationManager; - -//////////////////////////////////////////////////////////////////////////////// - -// A per-thread structure containing counters, chunk caches etc. -struct TThreadState - : public TFreeListItemBase - , public TLocalShardedState -{ - // TThreadState instances of all alive threads are put into a double-linked intrusive list. - // This is a pair of next/prev pointers connecting an instance of TThreadState to its neighbors. - TIntrusiveLinkedListNode RegistryNode; - - // Pointers to the respective parts of TThreadManager::ThreadControlWord_. - // If null then the thread is already destroyed (but TThreadState may still live for a while - // due to ref-counting). - ui8* AllocationProfilingEnabled; - ui8* BackgroundThreadStarted; - - // TThreadStates are ref-counted. - // TThreadManager::EnumerateThreadStates enumerates the registered states and acquires - // a temporary reference preventing these states from being destructed. 
This provides - // for shorter periods of time the global lock needs to be held. - int RefCounter = 1; - - // Per-thread counters. - TTotalCounters TotalCounters; - std::array LargeArenaCounters; - TLocalUndumpableCounters UndumpableCounters; - - // Each thread maintains caches of small chunks. - // One cache is for tagged chunks; the other is for untagged ones. - // Each cache contains up to MaxCachedChunksPerRank chunks per any rank. - // Special sentinels are placed to distinguish the boundaries of region containing - // pointers of a specific rank. This enables a tiny-bit faster inplace boundary checks. - - static constexpr uintptr_t LeftSentinel = 1; - static constexpr uintptr_t RightSentinel = 2; - - struct TSmallBlobCache - { - TSmallBlobCache() - { - void** chunkPtrs = CachedChunks.data(); - for (size_t rank = 0; rank < SmallRankCount; ++rank) { - RankToCachedChunkPtrHead[rank] = chunkPtrs; - chunkPtrs[0] = reinterpret_cast(LeftSentinel); - chunkPtrs[MaxCachedChunksPerRank + 1] = reinterpret_cast(RightSentinel); - -#ifdef YTALLOC_PARANOID - RankToCachedChunkPtrTail[rank] = chunkPtrs; - CachedChunkFull[rank] = false; - - RankToCachedChunkLeftBorder[rank] = chunkPtrs; - RankToCachedChunkRightBorder[rank] = chunkPtrs + MaxCachedChunksPerRank + 1; -#endif - chunkPtrs += MaxCachedChunksPerRank + 2; - } - } - - // For each rank we have a segment of pointers in CachedChunks with the following layout: - // LCC[C]........R - // Legend: - // . = garbage - // L = left sentinel - // R = right sentinel - // C = cached pointer - // [C] = current cached pointer - // - // Under YTALLOC_PARANOID the following layout is used: - // L.[T]CCC[H]...R - // Legend: - // [H] = head cached pointer, put chunks here - // [T] = tail cached pointer, take chunks from here - - // +2 is for two sentinels - std::array CachedChunks{}; - - // Pointer to [P] for each rank. - std::array RankToCachedChunkPtrHead{}; - -#ifdef YTALLOC_PARANOID - // Pointers to [L] and [R] for each rank. - std::array RankToCachedChunkLeftBorder{}; - std::array RankToCachedChunkRightBorder{}; - - std::array RankToCachedChunkPtrTail{}; - std::array CachedChunkFull{}; -#endif - }; - TEnumIndexedArray SmallBlobCache; -}; - -struct TThreadStateToRegistryNode -{ - auto operator() (TThreadState* state) const - { - return &state->RegistryNode; - } -}; - -// Manages all registered threads and controls access to TThreadState. -class TThreadManager -{ -public: - TThreadManager() - { - pthread_key_create(&ThreadDtorKey_, DestroyThread); - - NThreading::RegisterAtForkHandlers( - nullptr, - nullptr, - [=] { AfterFork(); }); - } - - // Returns TThreadState for the current thread; the caller guarantees that this - // state is initialized and is not destroyed yet. - static TThreadState* GetThreadStateUnchecked(); - - // Returns TThreadState for the current thread; may return null. - static TThreadState* FindThreadState(); - - // Returns TThreadState for the current thread; may not return null - // (but may crash if TThreadState is already destroyed). - static TThreadState* GetThreadStateChecked() - { - auto* state = FindThreadState(); - YTALLOC_VERIFY(state); - return state; - } - - // Enumerates all threads and invokes func passing TThreadState instances. - // func must not throw but can take arbitrary time; no locks are being held while it executes. 
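// EnumerateThreadStatesAsync below implements the contract just stated. As a reduced
// model, the pattern is "snapshot under a short lock, run handlers with no lock held":
// reference every registered state into a local vector while holding the registry lock,
// release the lock, invoke the (possibly slow) handler, then re-acquire the lock to drop
// the references. A sketch under those assumptions, with std::mutex and std::vector
// standing in for the fork-aware lock and the intrusive list.
#include <mutex>
#include <vector>

struct TStateSketch
{
    int RefCounter = 1; // guarded by the registry lock in this sketch
};

class TRegistrySketch
{
public:
    template <class THandler>
    void EnumerateAsync(const THandler& handler)
    {
        std::vector<TStateSketch*> snapshot;
        {
            std::lock_guard<std::mutex> guard(Lock_); // short critical section
            snapshot = Registry_; // NB: the sketch tolerates allocating here
            for (auto* state : snapshot) {
                ++state->RefCounter; // keep the state alive past the lock
            }
        }

        for (auto* state : snapshot) {
            handler(state); // may take arbitrary time; no lock is held
        }

        std::lock_guard<std::mutex> guard(Lock_);
        for (auto* state : snapshot) {
            if (--state->RefCounter == 0) {
                // The owning thread exited meanwhile; the real code destroys the
                // state here. Omitted in the sketch.
            }
        }
    }

private:
    std::mutex Lock_;
    std::vector<TStateSketch*> Registry_;
};
// Unlike this sketch, the real code must not allocate while the registry lock is held
// (allocation could re-enter the allocator and deadlock), hence the pre-reserve and
// retry loop visible in EnumerateThreadStatesAsync below.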
- template - void EnumerateThreadStatesAsync(const THandler& handler) noexcept - { - TMemoryTagGuard guard(NullMemoryTag); - - std::vector states; - states.reserve(1024); // must be enough in most cases - - auto unrefStates = [&] { - // Releasing references also requires global lock to be held to avoid getting zombies above. - auto guard = GuardWithTiming(ThreadRegistryLock_); - for (auto* state : states) { - UnrefThreadState(state); - } - }; - - auto tryRefStates = [&] { - // Only hold this guard for a small period of time to reference all the states. - auto guard = GuardWithTiming(ThreadRegistryLock_); - auto* current = ThreadRegistry_.GetFront(); - while (current) { - if (states.size() == states.capacity()) { - // Cannot allocate while holding ThreadRegistryLock_ due to a possible deadlock as follows: - // EnumerateThreadStatesAsync -> StartBackgroundThread -> EnumerateThreadStatesSync - // (many other scenarios are also possible). - guard.Release(); - unrefStates(); - states.clear(); - states.reserve(states.capacity() * 2); - return false; - } - RefThreadState(current); - states.push_back(current); - current = current->RegistryNode.Next; - } - return true; - }; - - while (!tryRefStates()) ; - - for (auto* state : states) { - handler(state); - } - - unrefStates(); - } - - // Similar to EnumerateThreadStatesAsync but holds the global lock while enumerating the threads. - // Also invokes a given prologue functor while holding the thread registry lock. - // Handler and prologue calls must be fast and must not allocate. - template - void EnumerateThreadStatesSync(const TPrologue& prologue, const THandler& handler) noexcept - { - auto guard = GuardWithTiming(ThreadRegistryLock_); - prologue(); - auto* current = ThreadRegistry_.GetFront(); - while (current) { - handler(current); - current = current->RegistryNode.Next; - } - } - - - // We store a special 64-bit "thread control word" in TLS encapsulating the following - // crucial per-thread parameters: - // * the current memory tag - // * a flag indicating that a valid TThreadState is known to exists - // (and can be obtained via GetThreadStateUnchecked) - // * a flag indicating that allocation profiling is enabled - // * a flag indicating that background thread is started - // Thread control word is fetched via GetThreadControlWord and is compared - // against FastPathControlWord to see if the fast path can be taken. - // The latter happens when no memory tagging is configured, TThreadState is - // valid, allocation profiling is disabled, and background thread is started. - - // The mask for extracting memory tag from thread control word. - static constexpr ui64 MemoryTagControlWordMask = 0xffffffff; - // ThreadStateValid is on. - static constexpr ui64 ThreadStateValidControlWordMask = (1ULL << 32); - // AllocationProfiling is on. - static constexpr ui64 AllocationProfilingEnabledControlWordMask = (1ULL << 40); - // All background thread are properly started. - static constexpr ui64 BackgroundThreadStartedControlWorkMask = (1ULL << 48); - // Memory tag is NullMemoryTag; thread state is valid. 
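// To make the fast-path test concrete: packing the tag and the flags into one 64-bit
// word lets the allocator validate all of them with a single compare. An illustrative
// (non-TLS) model of that check, using the bit positions from the masks defined next
// and assuming the null memory tag is zero, as the fast-path constant implies.
#include <cstdint>

// bits 0-31: memory tag, bit 32: thread state valid,
// bit 40: allocation profiling enabled, bit 48: background thread started.
constexpr uint64_t MemoryTagMaskSketch         = 0xffffffffULL;
constexpr uint64_t ThreadStateValidMaskSketch  = 1ULL << 32;
constexpr uint64_t ProfilingEnabledMaskSketch  = 1ULL << 40;
constexpr uint64_t BackgroundStartedMaskSketch = 1ULL << 48;
constexpr uint64_t NullTagSketch               = 0;

// Everything the fast path requires, folded into one constant.
constexpr uint64_t FastPathWordSketch =
    BackgroundStartedMaskSketch | ThreadStateValidMaskSketch | NullTagSketch;

inline bool CanTakeFastPath(uint64_t controlWord)
{
    // One 64-bit compare answers four questions at once: the tag is null, the thread
    // state is valid, profiling is off, and the background thread is running.
    return controlWord == FastPathWordSketch;
}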
- static constexpr ui64 FastPathControlWord = - BackgroundThreadStartedControlWorkMask | - ThreadStateValidControlWordMask | - NullMemoryTag; - - Y_FORCE_INLINE static ui64 GetThreadControlWord() - { - return (&ThreadControlWord_)->Value; - } - - - static TMemoryTag GetCurrentMemoryTag() - { - return (&ThreadControlWord_)->Parts.MemoryTag; - } - - static void SetCurrentMemoryTag(TMemoryTag tag) - { - Y_ABORT_UNLESS(tag <= MaxMemoryTag); - (&ThreadControlWord_)->Parts.MemoryTag = tag; - } - - - static EMemoryZone GetCurrentMemoryZone() - { - return CurrentMemoryZone_; - } - - static void SetCurrentMemoryZone(EMemoryZone zone) - { - CurrentMemoryZone_ = zone; - } - - - static void SetCurrentFiberId(TFiberId id) - { - CurrentFiberId_ = id; - } - - static TFiberId GetCurrentFiberId() - { - return CurrentFiberId_; - } - -private: - static void DestroyThread(void*); - - TThreadState* AllocateThreadState(); - - void RefThreadState(TThreadState* state) - { - auto result = ++state->RefCounter; - Y_ABORT_UNLESS(result > 1); - } - - void UnrefThreadState(TThreadState* state) - { - auto result = --state->RefCounter; - Y_ABORT_UNLESS(result >= 0); - if (result == 0) { - DestroyThreadState(state); - } - } - - void DestroyThreadState(TThreadState* state); - - void AfterFork(); - -private: - // TThreadState instance for the current thread. - // Initially null, then initialized when first needed. - // TThreadState is destroyed upon thread termination (which is detected with - // the help of pthread_key_create machinery), so this pointer can become null again. - Y_POD_STATIC_THREAD(TThreadState*) ThreadState_; - - // Initially false, then set to true then TThreadState is destroyed. - // If the thread requests for its state afterwards, null is returned and no new state is (re-)created. - // The caller must be able to deal with it. - Y_POD_STATIC_THREAD(bool) ThreadStateDestroyed_; - - union TThreadControlWord - { - ui64 __attribute__((__may_alias__)) Value; - struct TParts - { - // The current memory tag used in all allocations by this thread. - ui32 __attribute__((__may_alias__)) MemoryTag; - // Indicates if a valid TThreadState exists and can be obtained via GetThreadStateUnchecked. - ui8 __attribute__((__may_alias__)) ThreadStateValid; - // Indicates if allocation profiling is on. - ui8 __attribute__((__may_alias__)) AllocationProfilingEnabled; - // Indicates if all background threads are properly started. - ui8 __attribute__((__may_alias__)) BackgroundThreadStarted; - ui8 Padding[2]; - } Parts; - }; - Y_POD_STATIC_THREAD(TThreadControlWord) ThreadControlWord_; - - // See memory zone API. - Y_POD_STATIC_THREAD(EMemoryZone) CurrentMemoryZone_; - - // See fiber id API. - Y_POD_STATIC_THREAD(TFiberId) CurrentFiberId_; - - pthread_key_t ThreadDtorKey_; - - static constexpr size_t ThreadStatesBatchSize = 1; - TSystemPool ThreadStatePool_; - - NThreading::TForkAwareSpinLock ThreadRegistryLock_; - TIntrusiveLinkedList ThreadRegistry_; -}; - -Y_POD_THREAD(TThreadState*) TThreadManager::ThreadState_; -Y_POD_THREAD(bool) TThreadManager::ThreadStateDestroyed_; -Y_POD_THREAD(TThreadManager::TThreadControlWord) TThreadManager::ThreadControlWord_; -Y_POD_THREAD(EMemoryZone) TThreadManager::CurrentMemoryZone_; -Y_POD_THREAD(TFiberId) TThreadManager::CurrentFiberId_; - -TExplicitlyConstructableSingleton ThreadManager; - -//////////////////////////////////////////////////////////////////////////////// - -void TConfigurationManager::SetAllocationProfilingEnabled(bool value) -{ - // Update threads' TLS. 
- ThreadManager->EnumerateThreadStatesSync( - [&] { - AllocationProfilingEnabled_.store(value); - }, - [&] (auto* state) { - if (state->AllocationProfilingEnabled) { - *state->AllocationProfilingEnabled = value; - } - }); -} - -//////////////////////////////////////////////////////////////////////////////// -// Backtrace Manager -// -// Captures backtraces observed during allocations and assigns memory tags to them. -// Memory tags are chosen sequentially starting from AllocationProfilingMemoryTagBase. -// -// For each backtrace we compute a 64-bit hash and use it as a key in a certain concurrent hashmap. -// This hashmap is organized into BucketCount buckets, each consisting of BucketSize slots. -// -// Backtrace hash is translated into bucket index by taking the appropriate number of -// its lower bits. For each slot, we remember a 32-bit fingerprint, which is -// just the next 32 bits of the backtrace's hash, and the (previously assigned) memory tag. -// -// Upon access to the hashtable, the bucket is first scanned optimistically, without taking -// any locks. In case of a miss, a per-bucket spinlock is acquired and the bucket is rescanned. -// -// The above scheme may involve collisions but we neglect their probability. -// -// If the whole hash table overflows (i.e. a total of MaxCapturedAllocationBacktraces -// backtraces are captured) or the bucket overflows (i.e. all of its slots become occupied), -// the allocation is annotated with AllocationProfilingUnknownMemoryTag. Such allocations -// appear as having no backtrace whatsoever in the profiling reports. - -class TBacktraceManager -{ -public: - // Sets the provider used for collecting backtraces when allocation profiling - // is turned ON. - void SetBacktraceProvider(TBacktraceProvider provider) - { - BacktraceProvider_.store(provider); - } - - // Captures the backtrace and inserts it into the hashtable. - TMemoryTag GetMemoryTagFromBacktrace(int framesToSkip) - { - std::array frames; - auto backtraceProvider = BacktraceProvider_.load(); - if (!backtraceProvider) { - return NullMemoryTag; - } - auto frameCount = backtraceProvider(frames.data(), ConfigurationManager->GetProfilingBacktraceDepth(), framesToSkip); - auto hash = GetBacktraceHash(frames.data(), frameCount); - return CaptureBacktrace(hash, frames.data(), frameCount); - } - - // Returns the backtrace corresponding to the given tag, if any. 
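// CaptureBacktrace above and FindBacktrace below are the two halves of the bucket scheme
// laid out in the design comment. A reduced model of the lookup/insert path: the low bits
// of the hash pick a bucket, the next 32 bits become a non-zero fingerprint, readers scan
// the bucket lock-free, and only a miss takes the per-bucket lock to rescan and insert.
// Sizes, the overflow tag, and the missing capacity cap are simplifications.
#include <array>
#include <atomic>
#include <cstdint>
#include <mutex>

class TFingerprintTableSketch
{
public:
    static constexpr int Log2BucketCount = 8;
    static constexpr int BucketCount = 1 << Log2BucketCount;
    static constexpr int BucketSize = 4;
    static constexpr uint32_t OverflowTag = 0;

    uint32_t GetOrAssignTag(size_t hash)
    {
        size_t bucket = hash % BucketCount;
        uint32_t fp = static_cast<uint32_t>(hash >> Log2BucketCount);
        if (fp == 0) {
            fp = 1; // zero marks a free slot; keep real fingerprints non-zero
        }

        // Optimistic, lock-free pass.
        for (int slot = 0; slot < BucketSize; ++slot) {
            if (Fingerprints_[bucket][slot].load(std::memory_order_acquire) == fp) {
                return Tags_[bucket][slot].load(std::memory_order_relaxed);
            }
        }

        // Miss: rescan and possibly insert under the per-bucket lock.
        std::lock_guard<std::mutex> guard(Locks_[bucket]);
        int spare = -1;
        for (int slot = 0; slot < BucketSize; ++slot) {
            uint32_t current = Fingerprints_[bucket][slot].load(std::memory_order_relaxed);
            if (current == fp) {
                return Tags_[bucket][slot].load(std::memory_order_relaxed);
            }
            if (current == 0 && spare < 0) {
                spare = slot;
            }
        }
        if (spare < 0) {
            return OverflowTag; // bucket overflow, as described above
        }
        uint32_t tag = NextTag_++;
        Tags_[bucket][spare].store(tag, std::memory_order_relaxed);
        // Publish the fingerprint last so lock-free readers never see a half-filled slot.
        Fingerprints_[bucket][spare].store(fp, std::memory_order_release);
        return tag;
    }

private:
    std::array<std::array<std::atomic<uint32_t>, BucketSize>, BucketCount> Fingerprints_{};
    std::array<std::array<std::atomic<uint32_t>, BucketSize>, BucketCount> Tags_{};
    std::array<std::mutex, BucketCount> Locks_;
    std::atomic<uint32_t> NextTag_{1};
};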
- std::optional FindBacktrace(TMemoryTag tag) - { - if (tag < AllocationProfilingMemoryTagBase || - tag >= AllocationProfilingMemoryTagBase + MaxCapturedAllocationBacktraces) - { - return std::nullopt; - } - const auto& entry = Backtraces_[tag - AllocationProfilingMemoryTagBase]; - if (!entry.Captured.load()) { - return std::nullopt; - } - return entry.Backtrace; - } - -private: - static constexpr int Log2BucketCount = 16; - static constexpr int BucketCount = 1 << Log2BucketCount; - static constexpr int BucketSize = 8; - - std::atomic BacktraceProvider_ = nullptr; - - std::array, BucketSize>, BucketCount> Fingerprints_= {}; - std::array, BucketSize>, BucketCount> MemoryTags_ = {}; - std::array BucketLocks_; - std::atomic CurrentMemoryTag_ = AllocationProfilingMemoryTagBase; - - struct TBacktraceEntry - { - TBacktrace Backtrace; - std::atomic Captured = false; - }; - - std::array Backtraces_; - -private: - static size_t GetBacktraceHash(void** frames, int frameCount) - { - size_t hash = 0; - for (int index = 0; index < frameCount; ++index) { - hash = CombineHashes(hash, THash()(frames[index])); - } - return hash; - } - - TMemoryTag CaptureBacktrace(size_t hash, void** frames, int frameCount) - { - size_t bucketIndex = hash % BucketCount; - ui32 fingerprint = (hash >> Log2BucketCount) & 0xffffffff; - // Zero fingerprint indicates the slot is free; check and adjust to ensure - // that regular fingerprints are non-zero. - if (fingerprint == 0) { - fingerprint = 1; - } - - for (int slotIndex = 0; slotIndex < BucketSize; ++slotIndex) { - auto currentFingerprint = Fingerprints_[bucketIndex][slotIndex].load(std::memory_order_relaxed); - if (currentFingerprint == fingerprint) { - return MemoryTags_[bucketIndex][slotIndex].load(); - } - } - - auto guard = Guard(BucketLocks_[bucketIndex]); - - int spareSlotIndex = -1; - for (int slotIndex = 0; slotIndex < BucketSize; ++slotIndex) { - auto currentFingerprint = Fingerprints_[bucketIndex][slotIndex].load(std::memory_order_relaxed); - if (currentFingerprint == fingerprint) { - return MemoryTags_[bucketIndex][slotIndex]; - } - if (currentFingerprint == 0) { - spareSlotIndex = slotIndex; - break; - } - } - - if (spareSlotIndex < 0) { - return AllocationProfilingUnknownMemoryTag; - } - - auto memoryTag = CurrentMemoryTag_++; - if (memoryTag >= AllocationProfilingMemoryTagBase + MaxCapturedAllocationBacktraces) { - return AllocationProfilingUnknownMemoryTag; - } - - MemoryTags_[bucketIndex][spareSlotIndex].store(memoryTag); - Fingerprints_[bucketIndex][spareSlotIndex].store(fingerprint); - - auto& entry = Backtraces_[memoryTag - AllocationProfilingMemoryTagBase]; - entry.Backtrace.FrameCount = frameCount; - ::memcpy(entry.Backtrace.Frames.data(), frames, sizeof (void*) * frameCount); - entry.Captured.store(true); - - return memoryTag; - } -}; - -TExplicitlyConstructableSingleton BacktraceManager; - -//////////////////////////////////////////////////////////////////////////////// - -// Mimics the counters of TThreadState but uses std::atomic to survive concurrent access. -struct TGlobalState - : public TGlobalShardedState -{ - TTotalCounters> TotalCounters; - std::array LargeArenaCounters; - TGlobalUndumpableCounters UndumpableCounters; -}; - -TExplicitlyConstructableSingleton GlobalState; - -//////////////////////////////////////////////////////////////////////////////// - -// Accumulates various allocation statistics. 
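// TStatisticsManager below performs the accumulation announced above. As a reduced model
// of the core idea: totals are the global (post-mortem) counters plus the counters of
// every live thread state, and "used" is always derived at read time as allocated minus
// freed, never stored. The struct shapes here are schematic assumptions, not the
// library's types.
#include <sys/types.h>
#include <atomic>
#include <vector>

struct TGlobalCountersSketch
{
    std::atomic<ssize_t> BytesAllocated{0}; // concurrent writers -> atomics
    std::atomic<ssize_t> BytesFreed{0};
};

struct TThreadCountersSketch
{
    ssize_t BytesAllocated = 0; // single writer (the owning thread) -> plain integers
    ssize_t BytesFreed = 0;
};

struct TTotalsSketch
{
    ssize_t BytesAllocated = 0;
    ssize_t BytesFreed = 0;
    ssize_t BytesUsed = 0; // derived, never maintained incrementally
};

TTotalsSketch AggregateSketch(
    const TGlobalCountersSketch& global,
    const std::vector<const TThreadCountersSketch*>& threads)
{
    TTotalsSketch result;
    result.BytesAllocated = global.BytesAllocated.load();
    result.BytesFreed = global.BytesFreed.load();
    for (const auto* state : threads) {
        result.BytesAllocated += state->BytesAllocated;
        result.BytesFreed += state->BytesFreed;
    }
    result.BytesUsed = result.BytesAllocated - result.BytesFreed;
    return result;
}
// The real manager additionally compares the committed total against the process RSS
// (read from /proc/self/statm in GetProcessRss below) to report BytesUnaccounted.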
-class TStatisticsManager -{ -public: - template - static Y_FORCE_INLINE void IncrementTotalCounter(TState* state, TMemoryTag tag, EBasicCounter counter, ssize_t delta) - { - // This branch is typically resolved at compile time. - if (Kind == EAllocationKind::Tagged && tag != NullMemoryTag) { - IncrementTaggedTotalCounter(&state->TotalCounters, tag, counter, delta); - } else { - IncrementUntaggedTotalCounter(&state->TotalCounters, counter, delta); - } - } - - static Y_FORCE_INLINE void IncrementTotalCounter(TMemoryTag tag, EBasicCounter counter, ssize_t delta) - { - IncrementTotalCounter(GlobalState.Get(), tag, counter, delta); - } - - void IncrementSmallArenaCounter(ESmallArenaCounter counter, size_t rank, ssize_t delta) - { - SmallArenaCounters_[rank][counter] += delta; - } - - template - static Y_FORCE_INLINE void IncrementLargeArenaCounter(TState* state, size_t rank, ELargeArenaCounter counter, ssize_t delta) - { - state->LargeArenaCounters[rank][counter] += delta; - } - - template - static Y_FORCE_INLINE void IncrementUndumpableCounter(TState* state, EUndumpableCounter counter, ssize_t delta) - { - state->UndumpableCounters[counter] += delta; - } - - void IncrementHugeCounter(EHugeCounter counter, ssize_t delta) - { - HugeCounters_[counter] += delta; - } - - void IncrementHugeUndumpableCounter(EUndumpableCounter counter, ssize_t delta) - { - HugeUndumpableCounters_[counter] += delta; - } - - void IncrementSystemCounter(ESystemCounter counter, ssize_t delta) - { - SystemCounters_[counter] += delta; - } - - // Computes memory usage for a list of tags by aggregating counters across threads. - void GetTaggedMemoryCounters(const TMemoryTag* tags, size_t count, TEnumIndexedArray* counters) - { - TMemoryTagGuard guard(NullMemoryTag); - - for (size_t index = 0; index < count; ++index) { - counters[index][EBasicCounter::BytesAllocated] = 0; - counters[index][EBasicCounter::BytesFreed] = 0; - } - - for (size_t index = 0; index < count; ++index) { - auto tag = tags[index]; - counters[index][EBasicCounter::BytesAllocated] += LoadTaggedTotalCounter(GlobalState->TotalCounters, tag, EBasicCounter::BytesAllocated); - counters[index][EBasicCounter::BytesFreed] += LoadTaggedTotalCounter(GlobalState->TotalCounters, tag, EBasicCounter::BytesFreed); - } - - ThreadManager->EnumerateThreadStatesAsync( - [&] (const auto* state) { - for (size_t index = 0; index < count; ++index) { - auto tag = tags[index]; - counters[index][EBasicCounter::BytesAllocated] += LoadTaggedTotalCounter(state->TotalCounters, tag, EBasicCounter::BytesAllocated); - counters[index][EBasicCounter::BytesFreed] += LoadTaggedTotalCounter(state->TotalCounters, tag, EBasicCounter::BytesFreed); - } - }); - - for (size_t index = 0; index < count; ++index) { - counters[index][EBasicCounter::BytesUsed] = GetUsed(counters[index][EBasicCounter::BytesAllocated], counters[index][EBasicCounter::BytesFreed]); - } - } - - void GetTaggedMemoryUsage(const TMemoryTag* tags, size_t count, size_t* results) - { - TMemoryTagGuard guard(NullMemoryTag); - - std::vector> counters; - counters.resize(count); - GetTaggedMemoryCounters(tags, count, counters.data()); - - for (size_t index = 0; index < count; ++index) { - results[index] = counters[index][EBasicCounter::BytesUsed]; - } - } - - TEnumIndexedArray GetTotalAllocationCounters() - { - TEnumIndexedArray result; - - auto accumulate = [&] (const auto& counters) { - result[ETotalCounter::BytesAllocated] += LoadCounter(counters[EBasicCounter::BytesAllocated]); - result[ETotalCounter::BytesFreed] += 
LoadCounter(counters[EBasicCounter::BytesFreed]); - }; - - accumulate(GlobalState->TotalCounters.UntaggedCounters); - accumulate(GlobalState->TotalCounters.CumulativeTaggedCounters); - - ThreadManager->EnumerateThreadStatesAsync( - [&] (const auto* state) { - accumulate(state->TotalCounters.UntaggedCounters); - accumulate(state->TotalCounters.CumulativeTaggedCounters); - }); - - result[ETotalCounter::BytesUsed] = GetUsed( - result[ETotalCounter::BytesAllocated], - result[ETotalCounter::BytesFreed]); - - auto systemCounters = GetSystemAllocationCounters(); - result[ETotalCounter::BytesCommitted] += systemCounters[EBasicCounter::BytesUsed]; - - auto hugeCounters = GetHugeAllocationCounters(); - result[ETotalCounter::BytesCommitted] += hugeCounters[EHugeCounter::BytesUsed]; - - auto smallArenaCounters = GetSmallArenaAllocationCounters(); - for (size_t rank = 0; rank < SmallRankCount; ++rank) { - result[ETotalCounter::BytesCommitted] += smallArenaCounters[rank][ESmallArenaCounter::BytesCommitted]; - } - - auto largeArenaCounters = GetLargeArenaAllocationCounters(); - for (size_t rank = 0; rank < LargeRankCount; ++rank) { - result[ETotalCounter::BytesCommitted] += largeArenaCounters[rank][ELargeArenaCounter::BytesCommitted]; - } - - result[ETotalCounter::BytesUnaccounted] = std::max(GetProcessRss() - result[ETotalCounter::BytesCommitted], 0); - - return result; - } - - TEnumIndexedArray GetSmallAllocationCounters() - { - TEnumIndexedArray result; - - auto totalCounters = GetTotalAllocationCounters(); - result[ESmallCounter::BytesAllocated] = totalCounters[ETotalCounter::BytesAllocated]; - result[ESmallCounter::BytesFreed] = totalCounters[ETotalCounter::BytesFreed]; - result[ESmallCounter::BytesUsed] = totalCounters[ETotalCounter::BytesUsed]; - - auto largeArenaCounters = GetLargeArenaAllocationCounters(); - for (size_t rank = 0; rank < LargeRankCount; ++rank) { - result[ESmallCounter::BytesAllocated] -= largeArenaCounters[rank][ELargeArenaCounter::BytesAllocated]; - result[ESmallCounter::BytesFreed] -= largeArenaCounters[rank][ELargeArenaCounter::BytesFreed]; - result[ESmallCounter::BytesUsed] -= largeArenaCounters[rank][ELargeArenaCounter::BytesUsed]; - } - - auto hugeCounters = GetHugeAllocationCounters(); - result[ESmallCounter::BytesAllocated] -= hugeCounters[EHugeCounter::BytesAllocated]; - result[ESmallCounter::BytesFreed] -= hugeCounters[EHugeCounter::BytesFreed]; - result[ESmallCounter::BytesUsed] -= hugeCounters[EHugeCounter::BytesUsed]; - - return result; - } - - std::array GetSmallArenaAllocationCounters() - { - std::array result; - for (size_t rank = 0; rank < SmallRankCount; ++rank) { - for (auto counter : TEnumTraits::GetDomainValues()) { - result[rank][counter] = SmallArenaCounters_[rank][counter].load(); - } - } - return result; - } - - TEnumIndexedArray GetLargeAllocationCounters() - { - TEnumIndexedArray result; - auto largeArenaCounters = GetLargeArenaAllocationCounters(); - for (size_t rank = 0; rank < LargeRankCount; ++rank) { - result[ESmallCounter::BytesAllocated] += largeArenaCounters[rank][ELargeArenaCounter::BytesAllocated]; - result[ESmallCounter::BytesFreed] += largeArenaCounters[rank][ELargeArenaCounter::BytesFreed]; - result[ESmallCounter::BytesUsed] += largeArenaCounters[rank][ELargeArenaCounter::BytesUsed]; - } - return result; - } - - std::array GetLargeArenaAllocationCounters() - { - std::array result{}; - - for (size_t rank = 0; rank < LargeRankCount; ++rank) { - for (auto counter : TEnumTraits::GetDomainValues()) { - result[rank][counter] = 
GlobalState->LargeArenaCounters[rank][counter].load(); - } - } - - ThreadManager->EnumerateThreadStatesAsync( - [&] (const auto* state) { - for (size_t rank = 0; rank < LargeRankCount; ++rank) { - for (auto counter : TEnumTraits::GetDomainValues()) { - result[rank][counter] += state->LargeArenaCounters[rank][counter]; - } - } - }); - - for (size_t rank = 0; rank < LargeRankCount; ++rank) { - result[rank][ELargeArenaCounter::BytesUsed] = GetUsed(result[rank][ELargeArenaCounter::BytesAllocated], result[rank][ELargeArenaCounter::BytesFreed]); - result[rank][ELargeArenaCounter::BlobsUsed] = GetUsed(result[rank][ELargeArenaCounter::BlobsAllocated], result[rank][ELargeArenaCounter::BlobsFreed]); - } - - return result; - } - - TLocalSystemCounters GetSystemAllocationCounters() - { - TLocalSystemCounters result; - for (auto counter : TEnumTraits::GetDomainValues()) { - result[counter] = SystemCounters_[counter].load(); - } - result[ESystemCounter::BytesUsed] = GetUsed(result[ESystemCounter::BytesAllocated], result[ESystemCounter::BytesFreed]); - return result; - } - - TLocalHugeCounters GetHugeAllocationCounters() - { - TLocalHugeCounters result; - for (auto counter : TEnumTraits::GetDomainValues()) { - result[counter] = HugeCounters_[counter].load(); - } - result[EHugeCounter::BytesUsed] = GetUsed(result[EHugeCounter::BytesAllocated], result[EHugeCounter::BytesFreed]); - result[EHugeCounter::BlobsUsed] = GetUsed(result[EHugeCounter::BlobsAllocated], result[EHugeCounter::BlobsFreed]); - return result; - } - - TLocalUndumpableCounters GetUndumpableAllocationCounters() - { - TLocalUndumpableCounters result; - for (auto counter : TEnumTraits::GetDomainValues()) { - result[counter] = HugeUndumpableCounters_[counter].load(); - result[counter] += GlobalState->UndumpableCounters[counter].load(); - } - - ThreadManager->EnumerateThreadStatesAsync( - [&] (const auto* state) { - result[EUndumpableCounter::BytesAllocated] += LoadCounter(state->UndumpableCounters[EUndumpableCounter::BytesAllocated]); - result[EUndumpableCounter::BytesFreed] += LoadCounter(state->UndumpableCounters[EUndumpableCounter::BytesFreed]); - }); - - result[EUndumpableCounter::BytesUsed] = GetUsed(result[EUndumpableCounter::BytesAllocated], result[EUndumpableCounter::BytesFreed]); - return result; - } - - // Called before TThreadState is destroyed. - // Adds the counter values from TThreadState to the global counters. 
- void AccumulateLocalCounters(TThreadState* state) - { - for (auto counter : TEnumTraits::GetDomainValues()) { - GlobalState->TotalCounters.CumulativeTaggedCounters[counter] += state->TotalCounters.CumulativeTaggedCounters[counter]; - GlobalState->TotalCounters.UntaggedCounters[counter] += state->TotalCounters.UntaggedCounters[counter]; - } - for (size_t index = 0; index < MaxTaggedCounterSets; ++index) { - const auto* localSet = state->TotalCounters.FindTaggedCounterSet(index); - if (!localSet) { - continue; - } - auto* globalSet = GlobalState->TotalCounters.GetOrCreateTaggedCounterSet(index); - for (size_t jndex = 0; jndex < TaggedCounterSetSize; ++jndex) { - for (auto counter : TEnumTraits::GetDomainValues()) { - globalSet->Counters[jndex][counter] += localSet->Counters[jndex][counter]; - } - } - } - for (size_t rank = 0; rank < LargeRankCount; ++rank) { - for (auto counter : TEnumTraits::GetDomainValues()) { - GlobalState->LargeArenaCounters[rank][counter] += state->LargeArenaCounters[rank][counter]; - } - } - for (auto counter : TEnumTraits::GetDomainValues()) { - GlobalState->UndumpableCounters[counter] += state->UndumpableCounters[counter]; - } - } - -private: - template - static ssize_t LoadTaggedTotalCounter(const TTotalCounters& counters, TMemoryTag tag, EBasicCounter counter) - { - const auto* set = counters.FindTaggedCounterSet(tag / TaggedCounterSetSize); - if (Y_UNLIKELY(!set)) { - return 0; - } - return LoadCounter(set->Counters[tag % TaggedCounterSetSize][counter]); - } - - template - static Y_FORCE_INLINE void IncrementUntaggedTotalCounter(TTotalCounters* counters, EBasicCounter counter, ssize_t delta) - { - counters->UntaggedCounters[counter] += delta; - } - - template - static Y_FORCE_INLINE void IncrementTaggedTotalCounter(TTotalCounters* counters, TMemoryTag tag, EBasicCounter counter, ssize_t delta) - { - counters->CumulativeTaggedCounters[counter] += delta; - auto* set = counters->GetOrCreateTaggedCounterSet(tag / TaggedCounterSetSize); - set->Counters[tag % TaggedCounterSetSize][counter] += delta; - } - - - static ssize_t GetProcessRss() - { - auto* file = ::fopen("/proc/self/statm", "r"); - if (!file) { - return 0; - } - - ssize_t dummy; - ssize_t rssPages; - auto readResult = fscanf(file, "%zd %zd", &dummy, &rssPages); - - ::fclose(file); - - if (readResult != 2) { - return 0; - } - - return rssPages * PageSize; - } - -private: - TGlobalSystemCounters SystemCounters_; - std::array SmallArenaCounters_; - TGlobalHugeCounters HugeCounters_; - TGlobalUndumpableCounters HugeUndumpableCounters_; -}; - -TExplicitlyConstructableSingleton StatisticsManager; - -//////////////////////////////////////////////////////////////////////////////// - -void* TSystemAllocator::Allocate(size_t size) -{ - auto rawSize = GetRawBlobSize(size); - void* mmappedPtr; - while (true) { - auto currentPtr = CurrentPtr_.fetch_add(rawSize); - Y_ABORT_UNLESS(currentPtr + rawSize <= SystemZoneEnd); - mmappedPtr = MappedMemoryManager->Map( - currentPtr, - rawSize, - MAP_FIXED_NOREPLACE | MAP_POPULATE); - if (mmappedPtr == reinterpret_cast(currentPtr)) { - break; - } - if (mmappedPtr != MAP_FAILED) { - MappedMemoryManager->Unmap(mmappedPtr, rawSize); - } - } - auto* blob = static_cast(mmappedPtr); - new (blob) TSystemBlobHeader(size); - auto* result = HeaderToPtr(blob); - PoisonUninitializedRange(result, size); - StatisticsManager->IncrementSystemCounter(ESystemCounter::BytesAllocated, rawSize); - return result; -} - -void TSystemAllocator::Free(void* ptr) -{ - auto* blob = PtrToHeader(ptr); - auto 
rawSize = GetRawBlobSize(blob->Size); - MappedMemoryManager->Unmap(blob, rawSize); - StatisticsManager->IncrementSystemCounter(ESystemCounter::BytesFreed, rawSize); -} - -//////////////////////////////////////////////////////////////////////////////// -// Small allocator -// -// Allocations (called small chunks) are grouped by their sizes. Two most-significant binary digits are -// used to determine the rank of a chunk, which guarantees 25% overhead in the worst case. -// A pair of helper arrays (SizeToSmallRank1 and SizeToSmallRank2) are used to compute ranks; we expect -// them to be permanently cached. -// -// Chunks of the same rank are served by a (small) arena allocator. -// In fact, there are two arenas for each rank: one is for tagged allocations and another is for untagged ones. -// -// We encode chunk's rank and whether it is tagged or not in the resulting pointer as follows: -// 0- 3: must be zero due to alignment -// 4-39: varies -// 40-44: rank -// 45: 0 for untagged allocations, 1 for tagged ones -// 45-63: zeroes -// This enables computing chunk's rank and also determining if it is tagged in constant time -// without any additional lookups. Also, one pays no space overhead for untagged allocations -// and pays 16 bytes for each tagged one. -// -// Each arena allocates extents of memory by calling mmap for each extent of SmallExtentSize bytes. -// (Recall that this memory is never reclaimed.) -// Each extent is then sliced into segments of SmallSegmentSize bytes. -// Whenever a new segment is acquired, its memory is pre-faulted by madvise(MADV_POPULATE). -// New segments are acquired in a lock-free manner. -// -// Each thread maintains a separate cache of chunks of each rank (two caches to be precise: one -// for tagged allocations and the other for untagged). These caches are fully thread-local and -// involve no atomic operations. -// -// There are also global caches (per rank, for tagged and untagged allocations). -// Instead of keeping individual chunks these work with chunk groups (collections of up to ChunksPerGroup -// arbitrary chunks). -// -// When the local cache becomes exhausted, a group of chunks is fetched from the global cache -// (if the latter is empty then the arena allocator is consulted). -// Vice versa, if the local cache overflows, a group of chunks is moved from it to the global cache. -// -// Global caches and arena allocators also take care of (rare) cases when Allocate/Free is called -// without a valid thread state (which happens during thread shutdown when TThreadState is already destroyed). -// -// Each arena allocates memory in a certain "data" zone of SmallZoneSize. -// In addition to that zone, up to two "shadow" zones are maintained. -// -// The first one contains memory tags of chunks residing in the primary zone. -// The second one (which is present if YTALLOC_NERVOUS is defined) contains -// states of chunks. These states enable some simple internal sanity checks -// (e.g. detect attempts to double-free a chunk). -// -// Addresses in the data zone are directly mapped to offsets in shadow zones. -// When a segment of a small arena zone is allocated, the relevant portions of shadow -// zones get initialized (and also accounted for as a system allocation). -// -// Shadow zones are memory-mapped with MAP_NORESERVE flag and are quite sparse. -// These zones are omitted from core dumps due to their huge size and sparsity. - -// For each small rank i, gives max K such that 2^k <= SmallRankToSize[i]. 
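// The pointer layout documented above makes both lookups pure bit arithmetic, with no
// memory access at all. An illustrative decoder that follows that layout (bits 40-44
// carry the rank, bit 45 the tagged flag); the names are hypothetical stand-ins for the
// library's real PtrToSmallRank-style helpers, which live elsewhere.
#include <cstddef>
#include <cstdint>

inline size_t DecodeSmallRankSketch(const void* ptr)
{
    // Bits 40-44 of the address encode the rank of the arena the chunk came from.
    return (reinterpret_cast<uintptr_t>(ptr) >> 40) & 0x1f;
}

inline bool DecodeIsTaggedSketch(const void* ptr)
{
    // Bit 45 distinguishes the tagged data zone from the untagged one.
    return ((reinterpret_cast<uintptr_t>(ptr) >> 45) & 1) != 0;
}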
-// Chunk pointer is mapped to its shadow image via GetShadowOffset helper. -// Note that chunk size is not always a power of 2. To avoid costly integer division, -// chunk pointer is translated by means of bitwise shift only (leaving some bytes -// of shadow zones unused). This array provides the needed shifts. -constexpr int SmallRankToLogSize[SmallRankCount] = { - 0, - 4, 5, 5, 6, 6, 7, - 7, 8, 8, 9, 9, 10, 10, 11, - 11, 12, 12, 13, 13, 14, 14, 15 -}; - -enum class ESmallChunkState : ui8 -{ - Spare = 0, - Allocated = 0x61, // a - Freed = 0x66 // f -}; - -class TSmallArenaAllocator -{ -public: - TSmallArenaAllocator(EAllocationKind kind, size_t rank, uintptr_t dataZoneStart) - : Kind_(kind) - , Rank_(rank) - , LogSize_(SmallRankToLogSize[Rank_]) - , ChunkSize_(SmallRankToSize[Rank_]) - , DataZoneStart_(dataZoneStart) - , DataZoneAllocator_(DataZoneStart_, DataZoneStart_ + SmallZoneSize) - { } - - size_t PullMany(void** batch, size_t maxCount) - { - size_t count; - while (true) { - count = TryAllocateFromCurrentExtent(batch, maxCount); - if (Y_LIKELY(count != 0)) { - break; - } - PopulateAnotherExtent(); - } - return count; - } - - void* Allocate(size_t size) - { - void* ptr; - auto count = PullMany(&ptr, 1); - YTALLOC_PARANOID_ASSERT(count == 1); - YTALLOC_PARANOID_ASSERT(PtrToSmallRank(ptr) == Rank_); - PoisonUninitializedRange(ptr, size); - UpdateChunkState(ptr, ESmallChunkState::Freed, ESmallChunkState::Allocated); - return ptr; - } - - TMemoryTag GetAndResetMemoryTag(const void* ptr) - { - auto& tag = MemoryTagZoneStart_[GetShadowOffset(ptr)]; - auto currentTag = tag; - tag = NullMemoryTag; - return currentTag; - } - - void SetMemoryTag(void* ptr, TMemoryTag tag) - { - MemoryTagZoneStart_[GetShadowOffset(ptr)] = tag; - } - - void UpdateChunkState(const void* ptr, ESmallChunkState expectedState, ESmallChunkState newState) - { -#ifdef YTALLOC_NERVOUS - auto& state = ChunkStateZoneStart_[GetShadowOffset(ptr)]; - auto actualState = state; - if (Y_UNLIKELY(actualState != expectedState)) { - char message[256]; - snprintf(message, sizeof(message), "Invalid small chunk state at %p: expected %" PRIx8 ", actual %" PRIx8, - ptr, - static_cast(expectedState), - static_cast(actualState)); - YTALLOC_TRAP(message); - } - state = newState; -#else - Y_UNUSED(ptr); - Y_UNUSED(expectedState); - Y_UNUSED(newState); -#endif - } - -private: - size_t TryAllocateFromCurrentExtent(void** batch, size_t maxCount) - { - auto* oldPtr = CurrentPtr_.load(); - if (Y_UNLIKELY(!oldPtr)) { - return 0; - } - - auto* currentExtent = CurrentExtent_.load(std::memory_order_relaxed); - if (Y_UNLIKELY(!currentExtent)) { - return 0; - } - - char* newPtr; - while (true) { - if (Y_UNLIKELY(oldPtr < currentExtent || oldPtr + ChunkSize_ + RightReadableAreaSize > currentExtent + SmallExtentSize)) { - return 0; - } - - newPtr = std::min( - oldPtr + ChunkSize_ * maxCount, - currentExtent + SmallExtentSize); - - auto* alignedNewPtr = AlignDownToSmallSegment(currentExtent, newPtr); - if (alignedNewPtr > oldPtr) { - newPtr = alignedNewPtr; - } - - if (Y_LIKELY(CurrentPtr_.compare_exchange_weak(oldPtr, newPtr))) { - break; - } - } - - auto* firstSegment = AlignUpToSmallSegment(currentExtent, oldPtr); - auto* nextSegment = AlignUpToSmallSegment(currentExtent, newPtr); - if (firstSegment != nextSegment) { - auto size = nextSegment - firstSegment; - MappedMemoryManager->PopulateReadOnly(firstSegment, size); - - StatisticsManager->IncrementSmallArenaCounter(ESmallArenaCounter::BytesCommitted, Rank_, size); - 
StatisticsManager->IncrementSmallArenaCounter(ESmallArenaCounter::PagesCommitted, Rank_, size / PageSize); - if (Kind_ == EAllocationKind::Tagged) { - StatisticsManager->IncrementSystemCounter(ESystemCounter::BytesAllocated, size / ChunkSize_ * sizeof(TMemoryTag)); - } -#ifdef YTALLOC_NERVOUS - StatisticsManager->IncrementSystemCounter(ESystemCounter::BytesAllocated, size / ChunkSize_ * sizeof(ESmallChunkState)); -#endif - } - - size_t count = 0; - while (oldPtr != newPtr) { - UpdateChunkState(oldPtr, ESmallChunkState::Spare, ESmallChunkState::Freed); - - batch[count] = oldPtr; - - oldPtr += ChunkSize_; - count++; - } - return count; - } - - void PopulateAnotherExtent() - { - auto lockGuard = GuardWithTiming(ExtentLock_); - - auto* currentPtr = CurrentPtr_.load(); - auto* currentExtent = CurrentExtent_.load(); - - if (currentPtr && currentPtr + ChunkSize_ + RightReadableAreaSize <= currentExtent + SmallExtentSize) { - // No need for a new extent. - return; - } - - auto* newExtent = static_cast(DataZoneAllocator_.Allocate(SmallExtentAllocSize, 0)); - - AllocateShadowZones(); - - YTALLOC_VERIFY(reinterpret_cast(newExtent) % SmallExtentAllocSize == 0); - CurrentPtr_ = CurrentExtent_ = newExtent; - - StatisticsManager->IncrementSmallArenaCounter(ESmallArenaCounter::BytesMapped, Rank_, SmallExtentAllocSize); - StatisticsManager->IncrementSmallArenaCounter(ESmallArenaCounter::PagesMapped, Rank_, SmallExtentAllocSize / PageSize); - } - -private: - const EAllocationKind Kind_; - const size_t Rank_; - const size_t LogSize_; - const size_t ChunkSize_; - const uintptr_t DataZoneStart_; - - TZoneAllocator DataZoneAllocator_; - - bool ShadowZonesAllocated_ = false; - TMemoryTag* MemoryTagZoneStart_; -#ifdef YTALLOC_NERVOUS - ESmallChunkState* ChunkStateZoneStart_; -#endif - - NThreading::TForkAwareSpinLock ExtentLock_; - std::atomic CurrentPtr_ = nullptr; - std::atomic CurrentExtent_ = nullptr; - - size_t GetShadowOffset(const void* ptr) - { - return (reinterpret_cast(ptr) - DataZoneStart_) >> LogSize_; - } - - void AllocateShadowZones() - { - if (ShadowZonesAllocated_) { - return; - } - - if (Kind_ == EAllocationKind::Tagged) { - MemoryTagZoneStart_ = MapShadowZone(); - } -#ifdef YTALLOC_NERVOUS - ChunkStateZoneStart_ = MapShadowZone(); -#endif - - ShadowZonesAllocated_ = true; - } - - template - T* MapShadowZone() - { - auto size = AlignUp((SmallZoneSize >> LogSize_) * sizeof (T), PageSize); - auto* ptr = static_cast(MappedMemoryManager->Map(SystemZoneStart, size, MAP_NORESERVE)); - MappedMemoryManager->DontDump(ptr, size); - return ptr; - } -}; - -TExplicitlyConstructableSingleton, SmallRankCount>>> SmallArenaAllocators; - -//////////////////////////////////////////////////////////////////////////////// - -constexpr size_t ChunksPerGroup = 128; -constexpr size_t GroupsBatchSize = 1024; - -static_assert(ChunksPerGroup <= MaxCachedChunksPerRank, "ChunksPerGroup > MaxCachedChunksPerRank"); - -class TChunkGroup - : public TFreeListItemBase -{ -public: - bool IsEmpty() const - { - return Size_ == 0; - } - - size_t ExtractAll(void** ptrs) - { - auto count = Size_; - ::memcpy(ptrs, Ptrs_.data(), count * sizeof(void*)); - Size_ = 0; - return count; - } - - void PutOne(void* ptr) - { - PutMany(&ptr, 1); - } - - void PutMany(void** ptrs, size_t count) - { - YTALLOC_PARANOID_ASSERT(Size_ == 0); - YTALLOC_PARANOID_ASSERT(count <= ChunksPerGroup); - ::memcpy(Ptrs_.data(), ptrs, count * sizeof(void*)); - Size_ = count; - } - -private: - size_t Size_ = 0; // <= ChunksPerGroup - std::array Ptrs_; -}; - -class 
TGlobalSmallChunkCache -{ -public: - explicit TGlobalSmallChunkCache(EAllocationKind kind) - : Kind_(kind) - { } - -#ifdef YTALLOC_PARANOID - void CanonizeChunkPtrs(TThreadState* state, size_t rank) - { - auto& chunkPtrPtr = state->SmallBlobCache[Kind_].RankToCachedChunkPtrHead[rank]; - - auto leftBorder = state->SmallBlobCache[Kind_].RankToCachedChunkLeftBorder[rank]; - auto rightBorder = state->SmallBlobCache[Kind_].RankToCachedChunkRightBorder[rank]; - - state->SmallBlobCache[Kind_].CachedChunkFull[rank] = false; - if (chunkPtrPtr + 1 == rightBorder) { - chunkPtrPtr = leftBorder; - state->SmallBlobCache[Kind_].CachedChunkFull[rank] = true; - } - - state->SmallBlobCache[Kind_].RankToCachedChunkPtrTail[rank] = leftBorder; - } -#endif - - bool TryMoveGroupToLocal(TThreadState* state, size_t rank) - { - auto& groups = RankToChunkGroups_[rank]; - auto* group = groups.Extract(state); - if (!Y_LIKELY(group)) { - return false; - } - - YTALLOC_PARANOID_ASSERT(!group->IsEmpty()); - - auto& chunkPtrPtr = state->SmallBlobCache[Kind_].RankToCachedChunkPtrHead[rank]; -#ifdef YTALLOC_PARANOID - chunkPtrPtr = state->SmallBlobCache[Kind_].RankToCachedChunkLeftBorder[rank]; - state->SmallBlobCache[Kind_].RankToCachedChunkPtrTail[rank] = chunkPtrPtr; -#endif - auto chunkCount = group->ExtractAll(chunkPtrPtr + 1); - chunkPtrPtr += chunkCount; - -#ifdef YTALLOC_PARANOID - CanonizeChunkPtrs(state, rank); -#endif - GroupPool_.Free(state, group); - return true; - } - - void MoveGroupToGlobal(TThreadState* state, size_t rank) - { - auto* group = GroupPool_.Allocate(state); - - auto& chunkPtrPtr = state->SmallBlobCache[Kind_].RankToCachedChunkPtrHead[rank]; - YTALLOC_PARANOID_ASSERT(*(chunkPtrPtr + 1) == reinterpret_cast(TThreadState::RightSentinel)); - group->PutMany(chunkPtrPtr - ChunksPerGroup + 1, ChunksPerGroup); - chunkPtrPtr -= ChunksPerGroup; -#ifdef YTALLOC_PARANOID - ::memset(chunkPtrPtr + 1, 0, sizeof(void*) * ChunksPerGroup); - CanonizeChunkPtrs(state, rank); -#endif - - auto& groups = RankToChunkGroups_[rank]; - YTALLOC_PARANOID_ASSERT(!group->IsEmpty()); - groups.Put(state, group); - } - - void MoveOneToGlobal(void* ptr, size_t rank) - { - auto* group = GroupPool_.Allocate(&GlobalShardedState_); - group->PutOne(ptr); - - auto& groups = RankToChunkGroups_[rank]; - YTALLOC_PARANOID_ASSERT(!group->IsEmpty()); - groups.Put(&GlobalShardedState_, group); - } - -#ifdef YTALLOC_PARANOID - void MoveAllToGlobal(TThreadState* state, size_t rank) - { - auto leftSentinelBorder = state->SmallBlobCache[Kind_].RankToCachedChunkLeftBorder[rank]; - auto rightSentinelBorder = state->SmallBlobCache[Kind_].RankToCachedChunkRightBorder[rank]; - - auto& headPtr = state->SmallBlobCache[Kind_].RankToCachedChunkPtrHead[rank]; - auto& tailPtr = state->SmallBlobCache[Kind_].RankToCachedChunkPtrTail[rank]; - - if (tailPtr == headPtr && !state->SmallBlobCache[Kind_].CachedChunkFull[rank]) { - headPtr = leftSentinelBorder; - return; - } - - // (leftBorder, rightBorder] - auto moveIntervalToGlobal = [=] (void** leftBorder, void** rightBorder) { - while (true) { - size_t count = 0; - while (count < ChunksPerGroup && rightBorder != leftBorder) { - --rightBorder; - ++count; - } - - if (count == 0) { - break; - } - - auto* group = GroupPool_.Allocate(state); - group->PutMany(rightBorder + 1, count); - ::memset(rightBorder + 1, 0, sizeof(void*) * count); - auto& groups = RankToChunkGroups_[rank]; - groups.Put(state, group); - } - }; - - if (tailPtr >= headPtr) { - moveIntervalToGlobal(tailPtr, rightSentinelBorder - 1); - 
moveIntervalToGlobal(leftSentinelBorder, headPtr); - } else { - moveIntervalToGlobal(tailPtr, headPtr); - } - - headPtr = leftSentinelBorder; - } -#else - void MoveAllToGlobal(TThreadState* state, size_t rank) - { - auto& chunkPtrPtr = state->SmallBlobCache[Kind_].RankToCachedChunkPtrHead[rank]; - while (true) { - size_t count = 0; - while (count < ChunksPerGroup && *chunkPtrPtr != reinterpret_cast(TThreadState::LeftSentinel)) { - --chunkPtrPtr; - ++count; - } - - if (count == 0) { - break; - } - - auto* group = GroupPool_.Allocate(state); - group->PutMany(chunkPtrPtr + 1, count); - auto& groups = RankToChunkGroups_[rank]; - groups.Put(state, group); - } - } -#endif - -private: - const EAllocationKind Kind_; - - TGlobalShardedState GlobalShardedState_; - TShardedSystemPool GroupPool_; - std::array, SmallRankCount> RankToChunkGroups_; -}; - -TExplicitlyConstructableSingleton>> GlobalSmallChunkCaches; - -//////////////////////////////////////////////////////////////////////////////// - -class TSmallAllocator -{ -public: - template - static Y_FORCE_INLINE void* Allocate(TMemoryTag tag, size_t rank) - { - auto* state = TThreadManager::FindThreadState(); - if (Y_LIKELY(state)) { - return Allocate(tag, rank, state); - } - auto size = SmallRankToSize[rank]; - return AllocateGlobal(tag, rank, size); - } - -#ifdef YTALLOC_PARANOID - template - static Y_FORCE_INLINE void* Allocate(TMemoryTag tag, size_t rank, TThreadState* state) - { - auto& localCache = state->SmallBlobCache[Kind]; - auto& allocator = *(*SmallArenaAllocators)[Kind][rank]; - - size_t size = SmallRankToSize[rank]; - StatisticsManager->IncrementTotalCounter(state, tag, EBasicCounter::BytesAllocated, size); - - auto leftBorder = localCache.RankToCachedChunkLeftBorder[rank]; - auto rightBorder = localCache.RankToCachedChunkRightBorder[rank]; - - void* result; - while (true) { - auto& chunkHeadPtr = localCache.RankToCachedChunkPtrHead[rank]; - auto& cachedHeadPtr = *(chunkHeadPtr + 1); - auto* headPtr = cachedHeadPtr; - - auto& chunkTailPtr = localCache.RankToCachedChunkPtrTail[rank]; - auto& cachedTailPtr = *(chunkTailPtr + 1); - auto* tailPtr = cachedTailPtr; - - auto& chunkFull = localCache.CachedChunkFull[rank]; - - if (Y_LIKELY(chunkFull || headPtr != tailPtr)) { - YTALLOC_PARANOID_ASSERT(tailPtr); - cachedTailPtr = nullptr; - ++chunkTailPtr; - if (Y_LIKELY(chunkTailPtr + 1 == rightBorder)) { - chunkTailPtr = leftBorder; - } - - chunkFull = false; - result = tailPtr; - PoisonUninitializedRange(result, size); - allocator.UpdateChunkState(result, ESmallChunkState::Freed, ESmallChunkState::Allocated); - break; - } - - auto& globalCache = *(*GlobalSmallChunkCaches)[Kind]; - if (!globalCache.TryMoveGroupToLocal(state, rank)) { - result = allocator.Allocate(size); - break; - } - } - - if constexpr(Kind == EAllocationKind::Tagged) { - allocator.SetMemoryTag(result, tag); - } - - return result; - } - - template - static Y_FORCE_INLINE void Free(void* ptr) - { - auto rank = PtrToSmallRank(ptr); - auto size = SmallRankToSize[rank]; - - auto& allocator = *(*SmallArenaAllocators)[Kind][rank]; - - auto tag = NullMemoryTag; - if constexpr(Kind == EAllocationKind::Tagged) { - tag = allocator.GetAndResetMemoryTag(ptr); - YTALLOC_PARANOID_ASSERT(tag != NullMemoryTag); - } - - allocator.UpdateChunkState(ptr, ESmallChunkState::Allocated, ESmallChunkState::Freed); - PoisonFreedRange(ptr, size); - - auto* state = TThreadManager::FindThreadState(); - if (Y_UNLIKELY(!state)) { - FreeGlobal(tag, ptr, rank, size); - return; - } - - 
StatisticsManager->IncrementTotalCounter(state, tag, EBasicCounter::BytesFreed, size); - - auto& localCache = state->SmallBlobCache[Kind]; - - auto leftBorder = localCache.RankToCachedChunkLeftBorder[rank]; - auto rightBorder = localCache.RankToCachedChunkRightBorder[rank]; - - while (true) { - auto& chunkHeadPtr = localCache.RankToCachedChunkPtrHead[rank]; - auto& headPtr = *(chunkHeadPtr + 1); - - auto& chunkTailPtr = localCache.RankToCachedChunkPtrTail[rank]; - auto& chunkFull = localCache.CachedChunkFull[rank]; - - if (Y_LIKELY(!chunkFull)) { - headPtr = ptr; - ++chunkHeadPtr; - if (Y_LIKELY(chunkHeadPtr + 1 == rightBorder)) { - chunkHeadPtr = leftBorder; - } - chunkFull = (chunkHeadPtr == chunkTailPtr); - break; - } - - chunkHeadPtr = rightBorder - 1; - chunkTailPtr = leftBorder; - - auto& globalCache = *(*GlobalSmallChunkCaches)[Kind]; - globalCache.MoveGroupToGlobal(state, rank); - } - } - -#else - - template - static Y_FORCE_INLINE void* Allocate(TMemoryTag tag, size_t rank, TThreadState* state) - { - size_t size = SmallRankToSize[rank]; - StatisticsManager->IncrementTotalCounter(state, tag, EBasicCounter::BytesAllocated, size); - - auto& localCache = state->SmallBlobCache[Kind]; - auto& allocator = *(*SmallArenaAllocators)[Kind][rank]; - - void* result; - while (true) { - auto& chunkPtr = localCache.RankToCachedChunkPtrHead[rank]; - auto& cachedPtr = *chunkPtr; - auto* ptr = cachedPtr; - if (Y_LIKELY(ptr != reinterpret_cast(TThreadState::LeftSentinel))) { - --chunkPtr; - result = ptr; - allocator.UpdateChunkState(result, ESmallChunkState::Freed, ESmallChunkState::Allocated); - PoisonUninitializedRange(result, size); - break; - } - - auto& globalCache = *(*GlobalSmallChunkCaches)[Kind]; - if (globalCache.TryMoveGroupToLocal(state, rank)) { - continue; - } - - auto count = allocator.PullMany( - chunkPtr + 1, - SmallRankBatchSize[rank]); - chunkPtr += count; - } - - if constexpr(Kind == EAllocationKind::Tagged) { - allocator.SetMemoryTag(result, tag); - } - - return result; - } - - template - static Y_FORCE_INLINE void Free(void* ptr) - { - auto rank = PtrToSmallRank(ptr); - auto size = SmallRankToSize[rank]; - - auto& allocator = *(*SmallArenaAllocators)[Kind][rank]; - - auto tag = NullMemoryTag; - if constexpr(Kind == EAllocationKind::Tagged) { - tag = allocator.GetAndResetMemoryTag(ptr); - YTALLOC_PARANOID_ASSERT(tag != NullMemoryTag); - } - - allocator.UpdateChunkState(ptr, ESmallChunkState::Allocated, ESmallChunkState::Freed); - PoisonFreedRange(ptr, size); - - auto* state = TThreadManager::FindThreadState(); - if (Y_UNLIKELY(!state)) { - FreeGlobal(tag, ptr, rank, size); - return; - } - - StatisticsManager->IncrementTotalCounter(state, tag, EBasicCounter::BytesFreed, size); - - auto& localCache = state->SmallBlobCache[Kind]; - - while (true) { - auto& chunkPtrPtr = localCache.RankToCachedChunkPtrHead[rank]; - auto& chunkPtr = *(chunkPtrPtr + 1); - if (Y_LIKELY(chunkPtr != reinterpret_cast(TThreadState::RightSentinel))) { - chunkPtr = ptr; - ++chunkPtrPtr; - break; - } - - auto& globalCache = *(*GlobalSmallChunkCaches)[Kind]; - globalCache.MoveGroupToGlobal(state, rank); - } - } -#endif - - static size_t GetAllocationSize(const void* ptr) - { - return SmallRankToSize[PtrToSmallRank(ptr)]; - } - - static size_t GetAllocationSize(size_t size) - { - return SmallRankToSize[SizeToSmallRank(size)]; - } - - static void PurgeCaches() - { - DoPurgeCaches(); - DoPurgeCaches(); - } - -private: - template - static void DoPurgeCaches() - { - auto* state = 
TThreadManager::GetThreadStateChecked(); - for (size_t rank = 0; rank < SmallRankCount; ++rank) { - (*GlobalSmallChunkCaches)[Kind]->MoveAllToGlobal(state, rank); - } - } - - template - static void* AllocateGlobal(TMemoryTag tag, size_t rank, size_t size) - { - StatisticsManager->IncrementTotalCounter(tag, EBasicCounter::BytesAllocated, size); - - auto& allocator = *(*SmallArenaAllocators)[Kind][rank]; - auto* result = allocator.Allocate(size); - - if constexpr(Kind == EAllocationKind::Tagged) { - allocator.SetMemoryTag(result, tag); - } - - return result; - } - - template - static void FreeGlobal(TMemoryTag tag, void* ptr, size_t rank, size_t size) - { - StatisticsManager->IncrementTotalCounter(tag, EBasicCounter::BytesFreed, size); - - auto& globalCache = *(*GlobalSmallChunkCaches)[Kind]; - globalCache.MoveOneToGlobal(ptr, rank); - } -}; - -//////////////////////////////////////////////////////////////////////////////// -// Large blob allocator -// -// Like for small chunks, large blobs are grouped into arenas, where arena K handles -// blobs of size (2^{K-1},2^K]. Memory is mapped in extents of LargeExtentSize bytes. -// Each extent is split into segments of size 2^K (here segment is just a memory region, which may fully consist of -// unmapped pages). When a segment is actually allocated, it becomes a blob and a TLargeBlobHeader -// structure is placed at its start. -// -// When an extent is allocated, it is sliced into segments (not blobs, since no headers are placed and -// no memory is touched). These segments are put into disposed segments list. -// -// For each blob two separate sizes are maintained: BytesAcquired indicates the number of bytes -// acquired via madvise(MADV_POPULATE) from the system; BytesAllocated (<= BytesAcquired) corresponds -// to the number of bytes claimed by the user (including the header and page size alignment). -// If BytesAllocated == 0 then this blob is spare, i.e. -// was freed and remains cached for further possible reuse. -// -// When a new blob is being allocated, the allocator first tries to extract a spare blob. On success, -// its acquired size is extended (if needed); the acquired size never shrinks on allocation. -// If no spare blobs exist, a disposed segment is extracted and is turned into a blob (i.e. -// its header is initialized) and the needed number of bytes is acquired. If no disposed segments -// exist, then a new extent is allocated and sliced into segments. -// -// The above algorithm only claims memory from the system (by means of madvise(MADV_POPULATE)); -// the reclaim is handled by a separate background mechanism. Two types of reclaimable memory -// regions are possible: -// * spare: these correspond to spare blobs; upon reclaiming this region becomes a disposed segment -// * overhead: these correspond to trailing parts of allocated blobs in [BytesAllocated, BytesAcquired) byte range -// -// Reclaiming spare blobs is easy as these are explicitly tracked by spare blob lists. To reclaim, -// we atomically extract a blob from a spare list, call madvise(MADV_FREE), and put the pointer to -// the disposed segment list. -// -// Reclaiming overheads is more complicated since (a) allocated blobs are never tracked directly and -// (b) reclaiming them may interfere with Allocate and Free. -// -// To overcome (a), for each extent we maintain a bitmap marking segments that are actually blobs -// (i.e. contain a header). (For simplicity and efficiency this bitmap is just a vector of bytes.) 
-// These flags are updated in Allocate/Free with appropriate memory ordering. Note that -// blobs are only disposed (and are turned into segments) by the background thread; if this -// thread discovers a segment that is marked as a blob, then it is safe to assume that this segment -// remains a blob unless the thread disposes it. -// -// To overcome (b), each large blob header maintains a spin lock. When blob B is extracted -// from a spare list in Allocate, an acquisition is tried. If successful, B is returned to the -// user. Otherwise it is assumed that B is currently being examined by the background -// reclaimer thread. Allocate then skips this blob and retries extraction; the problem is that -// since the spare list is basically a stack one cannot just push B back into the spare list. -// Instead, B is pushed into a special locked spare list. This list is purged by the background -// thread on each tick and its items are pushed back into the usual spare list. -// -// A similar trick is used by Free: when invoked for blob B its spin lock acquisition is first -// tried. Upon success, B is moved to the spare list. On failure, Free has to postpone this deallocation -// by moving B into the freed locked list. This list, similarly, is being purged by the background thread. -// -// It remains to explain how the background thread computes the number of bytes to be reclaimed from -// each arena. To this aim, we first compute the total number of reclaimable bytes. -// This is the sum of spare and overhead bytes in all arenas minus the number of unreclaimable bytes -// The latter grows linearly in the number of used bytes and is capped from below by a MinUnreclaimableLargeBytes; -// and from above by MaxUnreclaimableLargeBytes. SetLargeUnreclaimableCoeff and Set(Min|Max)LargeUnreclaimableBytes -// enable tuning these control knobs. The reclaimable bytes are being taken from arenas starting from those -// with the largest spare and overhead volumes. -// -// The above implies that each large blob contains a fixed-size header preceeding it. -// Hence ptr % PageSize == sizeof (TLargeBlobHeader) for each ptr returned by Allocate -// (since large blob sizes are larger than PageSize and are divisible by PageSize). -// For AllocatePageAligned, however, ptr must be divisible by PageSize. To handle such an allocation, we -// artificially increase its size and align the result of Allocate up to the next page boundary. -// When handling a deallocation, ptr is moved back by UnalignPtr (which is capable of dealing -// with both the results of Allocate and AllocatePageAligned). -// This technique applies to both large and huge blobs. - -enum ELargeBlobState : ui64 -{ - Allocated = 0x6c6c61656772616cULL, // largeall - Spare = 0x727073656772616cULL, // largespr - LockedSpare = 0x70736c656772616cULL, // largelsp - LockedFreed = 0x72666c656772616cULL // largelfr -}; - -// Every large blob (either tagged or not) is prepended with this header. -struct TLargeBlobHeader - : public TFreeListItemBase -{ - TLargeBlobHeader( - TLargeBlobExtent* extent, - size_t bytesAcquired, - size_t bytesAllocated, - TMemoryTag tag) - : Extent(extent) - , BytesAcquired(bytesAcquired) - , Tag(tag) - , BytesAllocated(bytesAllocated) - , State(ELargeBlobState::Allocated) - { } - - TLargeBlobExtent* Extent; - // Number of bytes in all acquired pages. - size_t BytesAcquired; - std::atomic Locked = false; - TMemoryTag Tag = NullMemoryTag; - // For spare blobs this is zero. 
- // For allocated blobs this is the number of bytes requested by user (not including header of any alignment). - size_t BytesAllocated; - ELargeBlobState State; - char Padding[12]; -}; - -CHECK_HEADER_ALIGNMENT(TLargeBlobHeader) - -struct TLargeBlobExtent -{ - TLargeBlobExtent(size_t segmentCount, char* ptr) - : SegmentCount(segmentCount) - , Ptr(ptr) - { } - - size_t SegmentCount; - char* Ptr; - TLargeBlobExtent* NextExtent = nullptr; - - std::atomic DisposedFlags[0]; -}; - -// A helper node that enables storing a number of extent's segments -// in a free list. Recall that segments themselves do not posses any headers. -struct TDisposedSegment - : public TFreeListItemBase -{ - size_t Index; - TLargeBlobExtent* Extent; -}; - -struct TLargeArena -{ - size_t Rank = 0; - size_t SegmentSize = 0; - - TShardedFreeList SpareBlobs; - TFreeList LockedSpareBlobs; - TFreeList LockedFreedBlobs; - TFreeList DisposedSegments; - std::atomic FirstExtent = nullptr; - - TLargeBlobExtent* CurrentOverheadScanExtent = nullptr; - size_t CurrentOverheadScanSegment = 0; -}; - -template -class TLargeBlobAllocator -{ -public: - TLargeBlobAllocator() - : ZoneAllocator_(LargeZoneStart(Dumpable), LargeZoneEnd(Dumpable)) - { - for (size_t rank = 0; rank < Arenas_.size(); ++rank) { - auto& arena = Arenas_[rank]; - arena.Rank = rank; - arena.SegmentSize = (1ULL << rank); - } - } - - void* Allocate(size_t size) - { - auto* state = TThreadManager::FindThreadState(); - return Y_LIKELY(state) - ? DoAllocate(state, size) - : DoAllocate(GlobalState.Get(), size); - } - - void Free(void* ptr) - { - auto* state = TThreadManager::FindThreadState(); - if (Y_LIKELY(state)) { - DoFree(state, ptr); - } else { - DoFree(GlobalState.Get(), ptr); - } - } - - static size_t GetAllocationSize(const void* ptr) - { - UnalignPtr(ptr); - const auto* blob = PtrToHeader(ptr); - return blob->BytesAllocated; - } - - static size_t GetAllocationSize(size_t size) - { - return GetBlobAllocationSize(size); - } - - void RunBackgroundTasks() - { - ReinstallLockedBlobs(); - ReclaimMemory(); - } - - void SetBacktraceProvider(TBacktraceProvider provider) - { - BacktraceProvider_.store(provider); - } - -private: - template - void PopulateArenaPages(TState* state, TLargeArena* arena, void* ptr, size_t size) - { - MappedMemoryManager->Populate(ptr, size); - StatisticsManager->IncrementLargeArenaCounter(state, arena->Rank, ELargeArenaCounter::BytesPopulated, size); - StatisticsManager->IncrementLargeArenaCounter(state, arena->Rank, ELargeArenaCounter::PagesPopulated, size / PageSize); - StatisticsManager->IncrementLargeArenaCounter(state, arena->Rank, ELargeArenaCounter::BytesCommitted, size); - StatisticsManager->IncrementLargeArenaCounter(state, arena->Rank, ELargeArenaCounter::PagesCommitted, size / PageSize); - } - - template - void ReleaseArenaPages(TState* state, TLargeArena* arena, void* ptr, size_t size) - { - MappedMemoryManager->Release(ptr, size); - StatisticsManager->IncrementLargeArenaCounter(state, arena->Rank, ELargeArenaCounter::BytesReleased, size); - StatisticsManager->IncrementLargeArenaCounter(state, arena->Rank, ELargeArenaCounter::PagesReleased, size / PageSize); - StatisticsManager->IncrementLargeArenaCounter(state, arena->Rank, ELargeArenaCounter::BytesCommitted, -size); - StatisticsManager->IncrementLargeArenaCounter(state, arena->Rank, ELargeArenaCounter::PagesCommitted, -size / PageSize); - } - - bool TryLockBlob(TLargeBlobHeader* blob) - { - bool expected = false; - return blob->Locked.compare_exchange_strong(expected, true); - } - - 
void UnlockBlob(TLargeBlobHeader* blob) - { - blob->Locked.store(false); - } - - template - void MoveBlobToSpare(TState* state, TLargeArena* arena, TLargeBlobHeader* blob, bool unlock) - { - auto rank = arena->Rank; - auto size = blob->BytesAllocated; - auto rawSize = GetRawBlobSize(size); - StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesSpare, blob->BytesAcquired); - StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesOverhead, -(blob->BytesAcquired - rawSize)); - blob->BytesAllocated = 0; - if (unlock) { - UnlockBlob(blob); - } else { - YTALLOC_VERIFY(!blob->Locked.load()); - } - blob->State = ELargeBlobState::Spare; - arena->SpareBlobs.Put(state, blob); - } - - size_t GetBytesToReclaim(const std::array& arenaCounters) - { - size_t totalBytesAllocated = 0; - size_t totalBytesFreed = 0; - size_t totalBytesSpare = 0; - size_t totalBytesOverhead = 0; - for (size_t rank = 0; rank < Arenas_.size(); ++rank) { - const auto& counters = arenaCounters[rank]; - totalBytesAllocated += counters[ELargeArenaCounter::BytesAllocated]; - totalBytesFreed += counters[ELargeArenaCounter::BytesFreed]; - totalBytesSpare += counters[ELargeArenaCounter::BytesSpare]; - totalBytesOverhead += counters[ELargeArenaCounter::BytesOverhead]; - } - - auto totalBytesUsed = totalBytesAllocated - totalBytesFreed; - auto totalBytesReclaimable = totalBytesSpare + totalBytesOverhead; - - auto threshold = ClampVal( - static_cast(ConfigurationManager->GetLargeUnreclaimableCoeff() * totalBytesUsed), - ConfigurationManager->GetMinLargeUnreclaimableBytes(), - ConfigurationManager->GetMaxLargeUnreclaimableBytes()); - if (totalBytesReclaimable < threshold) { - return 0; - } - - auto bytesToReclaim = totalBytesReclaimable - threshold; - return AlignUp(bytesToReclaim, PageSize); - } - - void ReinstallLockedSpareBlobs(TLargeArena* arena) - { - auto* blob = arena->LockedSpareBlobs.ExtractAll(); - auto* state = TThreadManager::GetThreadStateChecked(); - - size_t count = 0; - while (blob) { - auto* nextBlob = blob->Next.load(); - YTALLOC_VERIFY(!blob->Locked.load()); - AssertBlobState(blob, ELargeBlobState::LockedSpare); - blob->State = ELargeBlobState::Spare; - arena->SpareBlobs.Put(state, blob); - blob = nextBlob; - ++count; - } - - if (count > 0) { - YTALLOC_LOG_DEBUG("Locked spare blobs reinstalled (Rank: %d, Blobs: %zu)", - arena->Rank, - count); - } - } - - void ReinstallLockedFreedBlobs(TLargeArena* arena) - { - auto* state = TThreadManager::GetThreadStateChecked(); - auto* blob = arena->LockedFreedBlobs.ExtractAll(); - - size_t count = 0; - while (blob) { - auto* nextBlob = blob->Next.load(); - AssertBlobState(blob, ELargeBlobState::LockedFreed); - MoveBlobToSpare(state, arena, blob, false); - ++count; - blob = nextBlob; - } - - if (count > 0) { - YTALLOC_LOG_DEBUG("Locked freed blobs reinstalled (Rank: %d, Blobs: %zu)", - arena->Rank, - count); - } - } - - void ReclaimSpareMemory(TLargeArena* arena, ssize_t bytesToReclaim) - { - if (bytesToReclaim <= 0) { - return; - } - - auto rank = arena->Rank; - auto* state = TThreadManager::GetThreadStateChecked(); - - YTALLOC_LOG_DEBUG("Started processing spare memory in arena (BytesToReclaim: %zdM, Rank: %d)", - bytesToReclaim / 1_MB, - rank); - - size_t bytesReclaimed = 0; - size_t blobsReclaimed = 0; - while (bytesToReclaim > 0) { - auto* blob = arena->SpareBlobs.ExtractRoundRobin(state); - if (!blob) { - break; - } - - AssertBlobState(blob, ELargeBlobState::Spare); - YTALLOC_VERIFY(blob->BytesAllocated == 0); - - auto 
bytesAcquired = blob->BytesAcquired; - StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesSpare, -bytesAcquired); - bytesToReclaim -= bytesAcquired; - bytesReclaimed += bytesAcquired; - blobsReclaimed += 1; - - auto* extent = blob->Extent; - auto* ptr = reinterpret_cast(blob); - ReleaseArenaPages( - state, - arena, - ptr, - bytesAcquired); - - size_t segmentIndex = (ptr - extent->Ptr) / arena->SegmentSize; - extent->DisposedFlags[segmentIndex].store(true, std::memory_order_relaxed); - - auto* disposedSegment = DisposedSegmentPool_.Allocate(); - disposedSegment->Index = segmentIndex; - disposedSegment->Extent = extent; - arena->DisposedSegments.Put(disposedSegment); - } - - StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::SpareBytesReclaimed, bytesReclaimed); - - YTALLOC_LOG_DEBUG("Finished processing spare memory in arena (Rank: %d, BytesReclaimed: %zdM, BlobsReclaimed: %zu)", - arena->Rank, - bytesReclaimed / 1_MB, - blobsReclaimed); - } - - void ReclaimOverheadMemory(TLargeArena* arena, ssize_t bytesToReclaim) - { - if (bytesToReclaim == 0) { - return; - } - - auto* state = TThreadManager::GetThreadStateChecked(); - auto rank = arena->Rank; - - YTALLOC_LOG_DEBUG("Started processing overhead memory in arena (BytesToReclaim: %zdM, Rank: %d)", - bytesToReclaim / 1_MB, - rank); - - size_t extentsTraversed = 0; - size_t segmentsTraversed = 0; - size_t bytesReclaimed = 0; - - bool restartedFromFirstExtent = false; - auto& currentExtent = arena->CurrentOverheadScanExtent; - auto& currentSegment = arena->CurrentOverheadScanSegment; - while (bytesToReclaim > 0) { - if (!currentExtent) { - if (restartedFromFirstExtent) { - break; - } - currentExtent = arena->FirstExtent.load(); - if (!currentExtent) { - break; - } - restartedFromFirstExtent = true; - } - - while (currentSegment < currentExtent->SegmentCount && bytesToReclaim > 0) { - ++segmentsTraversed; - if (!currentExtent->DisposedFlags[currentSegment].load(std::memory_order_acquire)) { - auto* ptr = currentExtent->Ptr + currentSegment * arena->SegmentSize; - auto* blob = reinterpret_cast(ptr); - YTALLOC_PARANOID_ASSERT(blob->Extent == currentExtent); - if (TryLockBlob(blob)) { - if (blob->BytesAllocated > 0) { - size_t rawSize = GetRawBlobSize(blob->BytesAllocated); - size_t bytesToRelease = blob->BytesAcquired - rawSize; - if (bytesToRelease > 0) { - ReleaseArenaPages( - state, - arena, - ptr + blob->BytesAcquired - bytesToRelease, - bytesToRelease); - StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesOverhead, -bytesToRelease); - blob->BytesAcquired = rawSize; - bytesToReclaim -= bytesToRelease; - bytesReclaimed += bytesToRelease; - } - } - UnlockBlob(blob); - } - } - ++currentSegment; - } - - ++extentsTraversed; - currentSegment = 0; - currentExtent = currentExtent->NextExtent; - } - - StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::OverheadBytesReclaimed, bytesReclaimed); - - YTALLOC_LOG_DEBUG("Finished processing overhead memory in arena (Rank: %d, Extents: %zu, Segments: %zu, BytesReclaimed: %zuM)", - arena->Rank, - extentsTraversed, - segmentsTraversed, - bytesReclaimed / 1_MB); - } - - void ReinstallLockedBlobs() - { - for (auto& arena : Arenas_) { - ReinstallLockedSpareBlobs(&arena); - ReinstallLockedFreedBlobs(&arena); - } - } - - void ReclaimMemory() - { - auto arenaCounters = StatisticsManager->GetLargeArenaAllocationCounters(); - ssize_t bytesToReclaim = GetBytesToReclaim(arenaCounters); - if 
(bytesToReclaim == 0) { - return; - } - - YTALLOC_LOG_DEBUG("Memory reclaim started (BytesToReclaim: %zdM)", - bytesToReclaim / 1_MB); - - std::array bytesReclaimablePerArena; - for (size_t rank = 0; rank < LargeRankCount; ++rank) { - bytesReclaimablePerArena[rank * 2] = arenaCounters[rank][ELargeArenaCounter::BytesOverhead]; - bytesReclaimablePerArena[rank * 2 + 1] = arenaCounters[rank][ELargeArenaCounter::BytesSpare]; - } - - std::array bytesToReclaimPerArena{}; - while (bytesToReclaim > 0) { - ssize_t maxBytes = std::numeric_limits::min(); - int maxIndex = -1; - for (int index = 0; index < LargeRankCount * 2; ++index) { - if (bytesReclaimablePerArena[index] > maxBytes) { - maxBytes = bytesReclaimablePerArena[index]; - maxIndex = index; - } - } - - if (maxIndex < 0) { - break; - } - - auto bytesToReclaimPerStep = std::min({bytesToReclaim, maxBytes, 4_MB}); - if (bytesToReclaimPerStep < 0) { - break; - } - - bytesToReclaimPerArena[maxIndex] += bytesToReclaimPerStep; - bytesReclaimablePerArena[maxIndex] -= bytesToReclaimPerStep; - bytesToReclaim -= bytesToReclaimPerStep; - } - - for (auto& arena : Arenas_) { - auto rank = arena.Rank; - ReclaimOverheadMemory(&arena, bytesToReclaimPerArena[rank * 2]); - ReclaimSpareMemory(&arena, bytesToReclaimPerArena[rank * 2 + 1]); - } - - YTALLOC_LOG_DEBUG("Memory reclaim finished"); - } - - template - void AllocateArenaExtent(TState* state, TLargeArena* arena) - { - auto rank = arena->Rank; - StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::ExtentsAllocated, 1); - - size_t segmentCount = LargeExtentSize / arena->SegmentSize; - size_t extentHeaderSize = AlignUp(sizeof (TLargeBlobExtent) + sizeof (TLargeBlobExtent::DisposedFlags[0]) * segmentCount, PageSize); - size_t allocationSize = extentHeaderSize + LargeExtentSize; - - auto* ptr = ZoneAllocator_.Allocate(allocationSize, MAP_NORESERVE); - if (!Dumpable) { - MappedMemoryManager->DontDump(ptr, allocationSize); - } - - if (auto backtraceProvider = BacktraceProvider_.load()) { - std::array frames; - auto frameCount = backtraceProvider( - frames.data(), - MaxAllocationProfilingBacktraceDepth, - 3); - MmapObservationManager->EnqueueEvent(allocationSize, frames, frameCount); - } - - StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesMapped, allocationSize); - StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::PagesMapped, allocationSize / PageSize); - - auto* extent = static_cast(ptr); - MappedMemoryManager->Populate(ptr, extentHeaderSize); - StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesPopulated, extentHeaderSize); - StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::PagesPopulated, extentHeaderSize / PageSize); - StatisticsManager->IncrementSystemCounter(ESystemCounter::BytesAllocated, extentHeaderSize); - - new (extent) TLargeBlobExtent(segmentCount, static_cast(ptr) + extentHeaderSize); - - for (size_t index = 0; index < segmentCount; ++index) { - auto* disposedSegment = DisposedSegmentPool_.Allocate(); - disposedSegment->Index = index; - disposedSegment->Extent = extent; - arena->DisposedSegments.Put(disposedSegment); - extent->DisposedFlags[index].store(true); - } - - auto* expectedFirstExtent = arena->FirstExtent.load(); - do { - extent->NextExtent = expectedFirstExtent; - } while (Y_UNLIKELY(!arena->FirstExtent.compare_exchange_weak(expectedFirstExtent, extent))); - } - - template - void* DoAllocate(TState* state, size_t size) - { - auto rawSize 
= GetRawBlobSize(size); - auto rank = GetLargeRank(rawSize); - auto tag = ConfigurationManager->IsLargeArenaAllocationProfiled(rank) - ? BacktraceManager->GetMemoryTagFromBacktrace(3) - : TThreadManager::GetCurrentMemoryTag(); - auto& arena = Arenas_[rank]; - YTALLOC_PARANOID_ASSERT(rawSize <= arena.SegmentSize); - - TLargeBlobHeader* blob; - while (true) { - blob = arena.SpareBlobs.Extract(state); - if (blob) { - AssertBlobState(blob, ELargeBlobState::Spare); - if (TryLockBlob(blob)) { - StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesSpare, -blob->BytesAcquired); - if (blob->BytesAcquired < rawSize) { - PopulateArenaPages( - state, - &arena, - reinterpret_cast(blob) + blob->BytesAcquired, - rawSize - blob->BytesAcquired); - blob->BytesAcquired = rawSize; - } else { - StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesOverhead, blob->BytesAcquired - rawSize); - } - YTALLOC_PARANOID_ASSERT(blob->BytesAllocated == 0); - blob->BytesAllocated = size; - blob->Tag = tag; - blob->State = ELargeBlobState::Allocated; - UnlockBlob(blob); - break; - } else { - blob->State = ELargeBlobState::LockedSpare; - arena.LockedSpareBlobs.Put(blob); - } - } - - auto* disposedSegment = arena.DisposedSegments.Extract(); - if (disposedSegment) { - auto index = disposedSegment->Index; - auto* extent = disposedSegment->Extent; - DisposedSegmentPool_.Free(disposedSegment); - - auto* ptr = extent->Ptr + index * arena.SegmentSize; - PopulateArenaPages( - state, - &arena, - ptr, - rawSize); - - blob = reinterpret_cast(ptr); - new (blob) TLargeBlobHeader(extent, rawSize, size, tag); - - extent->DisposedFlags[index].store(false, std::memory_order_release); - - break; - } - - AllocateArenaExtent(state, &arena); - } - - StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BlobsAllocated, 1); - StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesAllocated, size); - StatisticsManager->IncrementTotalCounter(state, tag, EBasicCounter::BytesAllocated, size); - if (!Dumpable) { - StatisticsManager->IncrementUndumpableCounter(state, EUndumpableCounter::BytesAllocated, size); - } - - auto* result = HeaderToPtr(blob); - YTALLOC_PARANOID_ASSERT(reinterpret_cast(result) >= LargeZoneStart(Dumpable) && reinterpret_cast(result) < LargeZoneEnd(Dumpable)); - PoisonUninitializedRange(result, size); - return result; - } - - template - void DoFree(TState* state, void* ptr) - { - YTALLOC_PARANOID_ASSERT(reinterpret_cast(ptr) >= LargeZoneStart(Dumpable) && reinterpret_cast(ptr) < LargeZoneEnd(Dumpable)); - - auto* blob = PtrToHeader(ptr); - AssertBlobState(blob, ELargeBlobState::Allocated); - - auto size = blob->BytesAllocated; - PoisonFreedRange(ptr, size); - - auto rawSize = GetRawBlobSize(size); - auto rank = GetLargeRank(rawSize); - auto& arena = Arenas_[rank]; - YTALLOC_PARANOID_ASSERT(blob->BytesAcquired <= arena.SegmentSize); - - auto tag = blob->Tag; - - StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BlobsFreed, 1); - StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesFreed, size); - StatisticsManager->IncrementTotalCounter(state, tag, EBasicCounter::BytesFreed, size); - if (!Dumpable) { - StatisticsManager->IncrementUndumpableCounter(state, EUndumpableCounter::BytesFreed, size); - } - - if (TryLockBlob(blob)) { - MoveBlobToSpare(state, &arena, blob, true); - } else { - blob->State = ELargeBlobState::LockedFreed; - 
arena.LockedFreedBlobs.Put(blob); - } - } - -private: - TZoneAllocator ZoneAllocator_; - std::array Arenas_; - - static constexpr size_t DisposedSegmentsBatchSize = 1024; - TSystemPool DisposedSegmentPool_; - - std::atomic BacktraceProvider_ = nullptr; -}; - -TExplicitlyConstructableSingleton> DumpableLargeBlobAllocator; -TExplicitlyConstructableSingleton> UndumpableLargeBlobAllocator; - -//////////////////////////////////////////////////////////////////////////////// -// Huge blob allocator -// -// Basically a wrapper for TZoneAllocator. - -// Acts as a signature to detect broken headers. -enum class EHugeBlobState : ui64 -{ - Allocated = 0x72666c656772616cULL // hugeallc -}; - -// Every huge blob (both tagged or not) is prepended with this header. -struct THugeBlobHeader -{ - THugeBlobHeader(TMemoryTag tag, size_t size, bool dumpable) - : Tag(tag) - , Size(size) - , State(EHugeBlobState::Allocated) - , Dumpable(dumpable) - { } - - TMemoryTag Tag; - size_t Size; - EHugeBlobState State; - bool Dumpable; - char Padding[7]; -}; - -CHECK_HEADER_ALIGNMENT(THugeBlobHeader) - -class THugeBlobAllocator -{ -public: - THugeBlobAllocator() - : ZoneAllocator_(HugeZoneStart, HugeZoneEnd) - { } - - void* Allocate(size_t size, bool dumpable) - { - YTALLOC_VERIFY(size <= MaxAllocationSize); - auto tag = TThreadManager::GetCurrentMemoryTag(); - auto rawSize = GetRawBlobSize(size); - auto* blob = static_cast(ZoneAllocator_.Allocate(rawSize, MAP_POPULATE)); - if (!dumpable) { - MappedMemoryManager->DontDump(blob, rawSize); - } - new (blob) THugeBlobHeader(tag, size, dumpable); - - StatisticsManager->IncrementTotalCounter(tag, EBasicCounter::BytesAllocated, size); - StatisticsManager->IncrementHugeCounter(EHugeCounter::BlobsAllocated, 1); - StatisticsManager->IncrementHugeCounter(EHugeCounter::BytesAllocated, size); - if (!dumpable) { - StatisticsManager->IncrementHugeUndumpableCounter(EUndumpableCounter::BytesAllocated, size); - } - - auto* result = HeaderToPtr(blob); - PoisonUninitializedRange(result, size); - return result; - } - - void Free(void* ptr) - { - auto* blob = PtrToHeader(ptr); - AssertBlobState(blob, EHugeBlobState::Allocated); - auto tag = blob->Tag; - auto size = blob->Size; - auto dumpable = blob->Dumpable; - PoisonFreedRange(ptr, size); - - auto rawSize = GetRawBlobSize(size); - ZoneAllocator_.Free(blob, rawSize); - - StatisticsManager->IncrementTotalCounter(tag, EBasicCounter::BytesFreed, size); - StatisticsManager->IncrementHugeCounter(EHugeCounter::BlobsFreed, 1); - StatisticsManager->IncrementHugeCounter(EHugeCounter::BytesFreed, size); - if (!dumpable) { - StatisticsManager->IncrementHugeUndumpableCounter(EUndumpableCounter::BytesFreed, size); - } - } - - static size_t GetAllocationSize(const void* ptr) - { - UnalignPtr(ptr); - const auto* blob = PtrToHeader(ptr); - return blob->Size; - } - - static size_t GetAllocationSize(size_t size) - { - return GetBlobAllocationSize(size); - } - -private: - TZoneAllocator ZoneAllocator_; -}; - -TExplicitlyConstructableSingleton HugeBlobAllocator; - -//////////////////////////////////////////////////////////////////////////////// -// A thunk to large and huge blob allocators - -class TBlobAllocator -{ -public: - static void* Allocate(size_t size) - { - InitializeGlobals(); - bool dumpable = GetCurrentMemoryZone() != EMemoryZone::Undumpable; - // NB: Account for the header. Also note that we may safely ignore the alignment since - // HugeAllocationSizeThreshold is already page-aligned. 
- if (Y_LIKELY(size < HugeAllocationSizeThreshold - sizeof(TLargeBlobHeader) - RightReadableAreaSize)) { - void* result = dumpable - ? DumpableLargeBlobAllocator->Allocate(size) - : UndumpableLargeBlobAllocator->Allocate(size); - YTALLOC_PARANOID_ASSERT(reinterpret_cast(result) >= LargeZoneStart(dumpable) && reinterpret_cast(result) < LargeZoneEnd(dumpable)); - return result; - } else { - auto* result = HugeBlobAllocator->Allocate(size, dumpable); - YTALLOC_PARANOID_ASSERT(reinterpret_cast(result) >= HugeZoneStart && reinterpret_cast(result) < HugeZoneEnd); - return result; - } - } - - static void Free(void* ptr) - { - InitializeGlobals(); - if (reinterpret_cast(ptr) < LargeZoneEnd(true)) { - YTALLOC_PARANOID_ASSERT(reinterpret_cast(ptr) >= LargeZoneStart(true) && reinterpret_cast(ptr) < LargeZoneEnd(true)); - UnalignPtr(ptr); - DumpableLargeBlobAllocator->Free(ptr); - } else if (reinterpret_cast(ptr) < LargeZoneEnd(false)) { - YTALLOC_PARANOID_ASSERT(reinterpret_cast(ptr) >= LargeZoneStart(false) && reinterpret_cast(ptr) < LargeZoneEnd(false)); - UnalignPtr(ptr); - UndumpableLargeBlobAllocator->Free(ptr); - } else if (reinterpret_cast(ptr) < HugeZoneEnd) { - YTALLOC_PARANOID_ASSERT(reinterpret_cast(ptr) >= HugeZoneStart && reinterpret_cast(ptr) < HugeZoneEnd); - UnalignPtr(ptr); - HugeBlobAllocator->Free(ptr); - } else { - YTALLOC_TRAP("Wrong ptr passed to Free"); - } - } -}; - -//////////////////////////////////////////////////////////////////////////////// - -Y_POD_THREAD(bool) CurrentThreadIsBackground; - -// Base class for all background threads. -template -class TBackgroundThreadBase -{ -public: - TBackgroundThreadBase() - : State_(new TState()) - { - NThreading::RegisterAtForkHandlers( - [=] { BeforeFork(); }, - [=] { AfterForkParent(); }, - [=] { AfterForkChild(); }); - } - - virtual ~TBackgroundThreadBase() - { - Stop(); - } - -private: - struct TState - : public TSystemAllocatable - { - std::mutex StartStopMutex; - std::optional Thread; - - std::mutex StopFlagMutex; - std::condition_variable StopFlagVariable; - std::chrono::system_clock::time_point LastInvocationTime; - bool StopFlag = false; - bool Paused = false; - - std::atomic ForkDepth = 0; - bool RestartAfterFork = false; - }; - - TState* State_; - -private: - void BeforeFork() - { - bool stopped = Stop(); - if (State_->ForkDepth++ == 0) { - State_->RestartAfterFork = stopped; - } - } - - void AfterForkParent() - { - if (--State_->ForkDepth == 0) { - if (State_->RestartAfterFork) { - Start(false); - } - } - } - - void AfterForkChild() - { - bool restart = State_->RestartAfterFork; - State_ = new TState(); - if (restart) { - Start(false); - } - } - - virtual void ThreadMain() = 0; - -protected: - void Start(bool fromAlloc) - { - std::unique_lock guard(State_->StartStopMutex, std::defer_lock); - if (fromAlloc) { - if (!guard.try_lock()) { - return; - } - - if (State_->Paused) { - return; - } - } else { - guard.lock(); - } - - State_->Paused = false; - if (State_->Thread) { - return; - } - - State_->StopFlag = false; - - State_->Thread.emplace([=] { - CurrentThreadIsBackground = true; - ThreadMain(); - }); - - OnStart(); - } - - bool Stop() - { - std::unique_lock guard(State_->StartStopMutex); - - State_->Paused = true; - if (!State_->Thread) { - return false; - } - - std::unique_lock flagGuard(State_->StopFlagMutex); - State_->StopFlag = true; - flagGuard.unlock(); - State_->StopFlagVariable.notify_one(); - - State_->Thread->join(); - State_->Thread.reset(); - - OnStop(); - - return true; - } - - bool IsDone(TDuration 
interval) - { - std::unique_lock flagGuard(State_->StopFlagMutex); - auto result = State_->StopFlagVariable.wait_until( - flagGuard, - State_->LastInvocationTime + std::chrono::microseconds(interval.MicroSeconds()), - [&] { return State_->StopFlag; }); - State_->LastInvocationTime = std::chrono::system_clock::now(); - return result; - } - - virtual void OnStart() - { } - - virtual void OnStop() - { } -}; - -//////////////////////////////////////////////////////////////////////////////// - -// Invokes madvise(MADV_STOCKPILE) periodically. -class TStockpileThread - : public TBackgroundThreadBase -{ -public: - explicit TStockpileThread(int index) - : Index_(index) - { - Start(false); - } - -private: - const int Index_; - - virtual void ThreadMain() override - { - TThread::SetCurrentThreadName(Sprintf("%s:%d", StockpileThreadName, Index_).c_str()); - - while (!IsDone(ConfigurationManager->GetStockpileInterval())) { - if (!MappedMemoryManager->Stockpile(ConfigurationManager->GetStockpileSize())) { - // No use to proceed. - YTALLOC_LOG_INFO("Stockpile call failed; terminating stockpile thread"); - break; - } - } - } -}; - -// Manages a bunch of TStockpileThreads. -class TStockpileManager -{ -public: - void SpawnIfNeeded() - { - if (!ConfigurationManager->IsStockpileEnabled()) { - return; - } - - int threadCount = ConfigurationManager->GetStockpileThreadCount(); - while (static_cast(Threads_.size()) > threadCount) { - Threads_.pop_back(); - } - while (static_cast(Threads_.size()) < threadCount) { - Threads_.push_back(std::make_unique(static_cast(Threads_.size()))); - } - } - -private: - std::vector> Threads_; -}; - -TExplicitlyConstructableSingleton StockpileManager; - -//////////////////////////////////////////////////////////////////////////////// - -// Time to wait before re-spawning the thread after a fork. -static constexpr auto BackgroundThreadRespawnDelay = TDuration::Seconds(3); - -// Runs basic background activities: reclaim, logging, profiling etc. -class TBackgroundThread - : public TBackgroundThreadBase -{ -public: - bool IsStarted() - { - return Started_.load(); - } - - void SpawnIfNeeded() - { - if (CurrentThreadIsBackground) { - return; - } - Start(true); - } - -private: - std::atomic Started_ = false; - -private: - virtual void ThreadMain() override - { - TThread::SetCurrentThreadName(BackgroundThreadName); - TimingManager->DisableForCurrentThread(); - MmapObservationManager->DisableForCurrentThread(); - - while (!IsDone(BackgroundInterval)) { - DumpableLargeBlobAllocator->RunBackgroundTasks(); - UndumpableLargeBlobAllocator->RunBackgroundTasks(); - MappedMemoryManager->RunBackgroundTasks(); - TimingManager->RunBackgroundTasks(); - MmapObservationManager->RunBackgroundTasks(); - StockpileManager->SpawnIfNeeded(); - } - } - - virtual void OnStart() override - { - DoUpdateAllThreadsControlWord(true); - } - - virtual void OnStop() override - { - DoUpdateAllThreadsControlWord(false); - } - - void DoUpdateAllThreadsControlWord(bool started) - { - // Update threads' TLS. 
- ThreadManager->EnumerateThreadStatesSync( - [&] { - Started_.store(started); - }, - [&] (auto* state) { - if (state->BackgroundThreadStarted) { - *state->BackgroundThreadStarted = started; - } - }); - } -}; - -TExplicitlyConstructableSingleton BackgroundThread; - -//////////////////////////////////////////////////////////////////////////////// - -Y_FORCE_INLINE TThreadState* TThreadManager::GetThreadStateUnchecked() -{ - YTALLOC_PARANOID_ASSERT(ThreadState_); - return ThreadState_; -} - -Y_FORCE_INLINE TThreadState* TThreadManager::FindThreadState() -{ - if (Y_LIKELY(ThreadState_)) { - return ThreadState_; - } - - if (ThreadStateDestroyed_) { - return nullptr; - } - - InitializeGlobals(); - - // InitializeGlobals must not allocate. - Y_ABORT_UNLESS(!ThreadState_); - ThreadState_ = ThreadManager->AllocateThreadState(); - (&ThreadControlWord_)->Parts.ThreadStateValid = true; - - return ThreadState_; -} - -void TThreadManager::DestroyThread(void*) -{ - TSmallAllocator::PurgeCaches(); - - TThreadState* state = ThreadState_; - ThreadState_ = nullptr; - ThreadStateDestroyed_ = true; - (&ThreadControlWord_)->Parts.ThreadStateValid = false; - - { - auto guard = GuardWithTiming(ThreadManager->ThreadRegistryLock_); - state->AllocationProfilingEnabled = nullptr; - state->BackgroundThreadStarted = nullptr; - ThreadManager->UnrefThreadState(state); - } -} - -void TThreadManager::DestroyThreadState(TThreadState* state) -{ - StatisticsManager->AccumulateLocalCounters(state); - ThreadRegistry_.Remove(state); - ThreadStatePool_.Free(state); -} - -void TThreadManager::AfterFork() -{ - auto guard = GuardWithTiming(ThreadRegistryLock_); - ThreadRegistry_.Clear(); - TThreadState* state = ThreadState_; - if (state) { - ThreadRegistry_.PushBack(state); - } -} - -TThreadState* TThreadManager::AllocateThreadState() -{ - auto* state = ThreadStatePool_.Allocate(); - state->AllocationProfilingEnabled = &(*&ThreadControlWord_).Parts.AllocationProfilingEnabled; - state->BackgroundThreadStarted = &(*&ThreadControlWord_).Parts.BackgroundThreadStarted; - - { - auto guard = GuardWithTiming(ThreadRegistryLock_); - // NB: These flags must be initialized under ThreadRegistryLock_; see EnumerateThreadStatesSync. - *state->AllocationProfilingEnabled = ConfigurationManager->IsAllocationProfilingEnabled(); - *state->BackgroundThreadStarted = BackgroundThread->IsStarted(); - ThreadRegistry_.PushBack(state); - } - - // Need to pass some non-null value for DestroyThread to be called. 
- pthread_setspecific(ThreadDtorKey_, (void*)-1); - - return state; -} - -//////////////////////////////////////////////////////////////////////////////// - -void InitializeGlobals() -{ - static std::once_flag Initialized; - std::call_once(Initialized, [] () { - LogManager.Construct(); - BacktraceManager.Construct(); - StatisticsManager.Construct(); - MappedMemoryManager.Construct(); - ThreadManager.Construct(); - GlobalState.Construct(); - DumpableLargeBlobAllocator.Construct(); - UndumpableLargeBlobAllocator.Construct(); - HugeBlobAllocator.Construct(); - ConfigurationManager.Construct(); - SystemAllocator.Construct(); - TimingManager.Construct(); - MmapObservationManager.Construct(); - StockpileManager.Construct(); - BackgroundThread.Construct(); - - SmallArenaAllocators.Construct(); - auto constructSmallArenaAllocators = [&] (EAllocationKind kind, uintptr_t zonesStart) { - for (size_t rank = 1; rank < SmallRankCount; ++rank) { - (*SmallArenaAllocators)[kind][rank].Construct(kind, rank, zonesStart + rank * SmallZoneSize); - } - }; - constructSmallArenaAllocators(EAllocationKind::Untagged, UntaggedSmallZonesStart); - constructSmallArenaAllocators(EAllocationKind::Tagged, TaggedSmallZonesStart); - - GlobalSmallChunkCaches.Construct(); - (*GlobalSmallChunkCaches)[EAllocationKind::Tagged].Construct(EAllocationKind::Tagged); - (*GlobalSmallChunkCaches)[EAllocationKind::Untagged].Construct(EAllocationKind::Untagged); - }); -} - -//////////////////////////////////////////////////////////////////////////////// - -void StartBackgroundThread() -{ - InitializeGlobals(); - BackgroundThread->SpawnIfNeeded(); -} - -//////////////////////////////////////////////////////////////////////////////// - -template -Y_FORCE_INLINE void* AllocateSmallUntagged(size_t rank, Ts... args) -{ - auto* result = TSmallAllocator::Allocate(NullMemoryTag, rank, std::forward(args)...); - YTALLOC_PARANOID_ASSERT(reinterpret_cast(result) >= MinUntaggedSmallPtr && reinterpret_cast(result) < MaxUntaggedSmallPtr); - return result; -} - -template -Y_FORCE_INLINE void* AllocateSmallTagged(ui64 controlWord, size_t rank, Ts... args) -{ - auto tag = Y_UNLIKELY((controlWord & TThreadManager::AllocationProfilingEnabledControlWordMask) && ConfigurationManager->IsSmallArenaAllocationProfiled(rank)) - ? 
BacktraceManager->GetMemoryTagFromBacktrace(2) - : static_cast(controlWord & TThreadManager::MemoryTagControlWordMask); - auto* result = TSmallAllocator::Allocate(tag, rank, std::forward(args)...); - YTALLOC_PARANOID_ASSERT(reinterpret_cast(result) >= MinTaggedSmallPtr && reinterpret_cast(result) < MaxTaggedSmallPtr); - return result; -} - -Y_FORCE_INLINE void* AllocateInline(size_t size) -{ - size_t rank; - if (Y_LIKELY(size <= 512)) { - rank = SizeToSmallRank1[(size + 7) >> 3]; - } else if (Y_LIKELY(size < LargeAllocationSizeThreshold)) { - rank = SizeToSmallRank2[(size - 1) >> 8]; - } else { - StartBackgroundThread(); - return TBlobAllocator::Allocate(size); - } - - auto controlWord = TThreadManager::GetThreadControlWord(); - if (Y_LIKELY(controlWord == TThreadManager::FastPathControlWord)) { - return AllocateSmallUntagged(rank, TThreadManager::GetThreadStateUnchecked()); - } - - if (Y_UNLIKELY(!(controlWord & TThreadManager::BackgroundThreadStartedControlWorkMask))) { - StartBackgroundThread(); - } - - if (!(controlWord & (TThreadManager::MemoryTagControlWordMask | TThreadManager::AllocationProfilingEnabledControlWordMask))) { - return AllocateSmallUntagged(rank); - } else { - return AllocateSmallTagged(controlWord, rank); - } -} - -Y_FORCE_INLINE void* AllocateSmallInline(size_t rank) -{ - auto controlWord = TThreadManager::GetThreadControlWord(); - if (Y_LIKELY(controlWord == TThreadManager::FastPathControlWord)) { - return AllocateSmallUntagged(rank, TThreadManager::GetThreadStateUnchecked()); - } - - if (!(controlWord & (TThreadManager::MemoryTagControlWordMask | TThreadManager::AllocationProfilingEnabledControlWordMask))) { - return AllocateSmallUntagged(rank); - } else { - return AllocateSmallTagged(controlWord, rank); - } -} - -Y_FORCE_INLINE void* AllocatePageAlignedInline(size_t size) -{ - size = std::max(AlignUp(size, PageSize), PageSize); - void* result = size >= LargeAllocationSizeThreshold - ? 
AlignUp(TBlobAllocator::Allocate(size + PageSize), PageSize) - : Allocate(size); - YTALLOC_ASSERT(reinterpret_cast(result) % PageSize == 0); - return result; -} - -Y_FORCE_INLINE void FreeNonNullInline(void* ptr) -{ - YTALLOC_ASSERT(ptr); - if (Y_LIKELY(reinterpret_cast(ptr) < UntaggedSmallZonesEnd)) { - YTALLOC_PARANOID_ASSERT(reinterpret_cast(ptr) >= MinUntaggedSmallPtr && reinterpret_cast(ptr) < MaxUntaggedSmallPtr); - TSmallAllocator::Free(ptr); - } else if (Y_LIKELY(reinterpret_cast(ptr) < TaggedSmallZonesEnd)) { - YTALLOC_PARANOID_ASSERT(reinterpret_cast(ptr) >= MinTaggedSmallPtr && reinterpret_cast(ptr) < MaxTaggedSmallPtr); - TSmallAllocator::Free(ptr); - } else { - TBlobAllocator::Free(ptr); - } -} - -Y_FORCE_INLINE void FreeInline(void* ptr) -{ - if (Y_LIKELY(ptr)) { - FreeNonNullInline(ptr); - } -} - -Y_FORCE_INLINE size_t GetAllocationSizeInline(const void* ptr) -{ - if (Y_UNLIKELY(!ptr)) { - return 0; - } - - auto uintptr = reinterpret_cast(ptr); - if (uintptr < UntaggedSmallZonesEnd) { - YTALLOC_PARANOID_ASSERT(uintptr >= MinUntaggedSmallPtr && uintptr < MaxUntaggedSmallPtr); - return TSmallAllocator::GetAllocationSize(ptr); - } else if (uintptr < TaggedSmallZonesEnd) { - YTALLOC_PARANOID_ASSERT(uintptr >= MinTaggedSmallPtr && uintptr < MaxTaggedSmallPtr); - return TSmallAllocator::GetAllocationSize(ptr); - } else if (uintptr < LargeZoneEnd(true)) { - YTALLOC_PARANOID_ASSERT(uintptr >= LargeZoneStart(true) && uintptr < LargeZoneEnd(true)); - return TLargeBlobAllocator::GetAllocationSize(ptr); - } else if (uintptr < LargeZoneEnd(false)) { - YTALLOC_PARANOID_ASSERT(uintptr >= LargeZoneStart(false) && uintptr < LargeZoneEnd(false)); - return TLargeBlobAllocator::GetAllocationSize(ptr); - } else if (uintptr < HugeZoneEnd) { - YTALLOC_PARANOID_ASSERT(uintptr >= HugeZoneStart && uintptr < HugeZoneEnd); - return THugeBlobAllocator::GetAllocationSize(ptr); - } else { - YTALLOC_TRAP("Wrong ptr passed to GetAllocationSizeInline"); - } -} - -Y_FORCE_INLINE size_t GetAllocationSizeInline(size_t size) -{ - if (size <= LargeAllocationSizeThreshold) { - return TSmallAllocator::GetAllocationSize(size); - } else if (size <= HugeAllocationSizeThreshold) { - return TLargeBlobAllocator::GetAllocationSize(size); - } else { - return THugeBlobAllocator::GetAllocationSize(size); - } -} - -void EnableLogging(TLogHandler logHandler) -{ - InitializeGlobals(); - LogManager->EnableLogging(logHandler); -} - -void SetBacktraceProvider(TBacktraceProvider provider) -{ - InitializeGlobals(); - BacktraceManager->SetBacktraceProvider(provider); - DumpableLargeBlobAllocator->SetBacktraceProvider(provider); - UndumpableLargeBlobAllocator->SetBacktraceProvider(provider); -} - -void SetBacktraceFormatter(TBacktraceFormatter provider) -{ - InitializeGlobals(); - MmapObservationManager->SetBacktraceFormatter(provider); -} - -void EnableStockpile() -{ - InitializeGlobals(); - ConfigurationManager->EnableStockpile(); -} - -void SetStockpileInterval(TDuration value) -{ - InitializeGlobals(); - ConfigurationManager->SetStockpileInterval(value); -} - -void SetStockpileThreadCount(int value) -{ - InitializeGlobals(); - ConfigurationManager->SetStockpileThreadCount(value); -} - -void SetStockpileSize(size_t value) -{ - InitializeGlobals(); - ConfigurationManager->SetStockpileSize(value); -} - -void SetLargeUnreclaimableCoeff(double value) -{ - InitializeGlobals(); - ConfigurationManager->SetLargeUnreclaimableCoeff(value); -} - -void SetTimingEventThreshold(TDuration value) -{ - InitializeGlobals(); - 
ConfigurationManager->SetTimingEventThreshold(value); -} - -void SetMinLargeUnreclaimableBytes(size_t value) -{ - InitializeGlobals(); - ConfigurationManager->SetMinLargeUnreclaimableBytes(value); -} - -void SetMaxLargeUnreclaimableBytes(size_t value) -{ - InitializeGlobals(); - ConfigurationManager->SetMaxLargeUnreclaimableBytes(value); -} - -void SetAllocationProfilingEnabled(bool value) -{ - ConfigurationManager->SetAllocationProfilingEnabled(value); -} - -void SetAllocationProfilingSamplingRate(double rate) -{ - ConfigurationManager->SetAllocationProfilingSamplingRate(rate); -} - -void SetSmallArenaAllocationProfilingEnabled(size_t rank, bool value) -{ - ConfigurationManager->SetSmallArenaAllocationProfilingEnabled(rank, value); -} - -void SetLargeArenaAllocationProfilingEnabled(size_t rank, bool value) -{ - ConfigurationManager->SetLargeArenaAllocationProfilingEnabled(rank, value); -} - -void SetProfilingBacktraceDepth(int depth) -{ - ConfigurationManager->SetProfilingBacktraceDepth(depth); -} - -void SetMinProfilingBytesUsedToReport(size_t size) -{ - ConfigurationManager->SetMinProfilingBytesUsedToReport(size); -} - -void SetEnableEagerMemoryRelease(bool value) -{ - ConfigurationManager->SetEnableEagerMemoryRelease(value); -} - -void SetEnableMadvisePopulate(bool value) -{ - ConfigurationManager->SetEnableMadvisePopulate(value); -} - -TEnumIndexedArray GetTotalAllocationCounters() -{ - InitializeGlobals(); - return StatisticsManager->GetTotalAllocationCounters(); -} - -TEnumIndexedArray GetSystemAllocationCounters() -{ - InitializeGlobals(); - return StatisticsManager->GetSystemAllocationCounters(); -} - -TEnumIndexedArray GetUndumpableAllocationCounters() -{ - InitializeGlobals(); - return StatisticsManager->GetUndumpableAllocationCounters(); -} - -TEnumIndexedArray GetSmallAllocationCounters() -{ - InitializeGlobals(); - return StatisticsManager->GetSmallAllocationCounters(); -} - -TEnumIndexedArray GetLargeAllocationCounters() -{ - InitializeGlobals(); - return StatisticsManager->GetLargeAllocationCounters(); -} - -std::array, SmallRankCount> GetSmallArenaAllocationCounters() -{ - InitializeGlobals(); - return StatisticsManager->GetSmallArenaAllocationCounters(); -} - -std::array, LargeRankCount> GetLargeArenaAllocationCounters() -{ - InitializeGlobals(); - return StatisticsManager->GetLargeArenaAllocationCounters(); -} - -TEnumIndexedArray GetHugeAllocationCounters() -{ - InitializeGlobals(); - return StatisticsManager->GetHugeAllocationCounters(); -} - -std::vector GetProfiledAllocationStatistics() -{ - InitializeGlobals(); - - if (!ConfigurationManager->IsAllocationProfilingEnabled()) { - return {}; - } - - std::vector tags; - tags.reserve(MaxCapturedAllocationBacktraces + 1); - for (TMemoryTag tag = AllocationProfilingMemoryTagBase; - tag < AllocationProfilingMemoryTagBase + MaxCapturedAllocationBacktraces; - ++tag) - { - tags.push_back(tag); - } - tags.push_back(AllocationProfilingUnknownMemoryTag); - - std::vector> counters; - counters.resize(tags.size()); - StatisticsManager->GetTaggedMemoryCounters(tags.data(), tags.size(), counters.data()); - - std::vector statistics; - for (size_t index = 0; index < tags.size(); ++index) { - if (counters[index][EBasicCounter::BytesUsed] < static_cast(ConfigurationManager->GetMinProfilingBytesUsedToReport())) { - continue; - } - auto tag = tags[index]; - auto optionalBacktrace = BacktraceManager->FindBacktrace(tag); - if (!optionalBacktrace && tag != AllocationProfilingUnknownMemoryTag) { - continue; - } - 
statistics.push_back(TProfiledAllocation{ - optionalBacktrace.value_or(TBacktrace()), - counters[index] - }); - } - return statistics; -} - -TEnumIndexedArray GetTimingEventCounters() -{ - InitializeGlobals(); - return TimingManager->GetTimingEventCounters(); -} - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace NYT::NYTAlloc diff --git a/library/cpp/ytalloc/impl/ya.make b/library/cpp/ytalloc/impl/ya.make deleted file mode 100644 index 23b6d5874edb..000000000000 --- a/library/cpp/ytalloc/impl/ya.make +++ /dev/null @@ -1,15 +0,0 @@ -LIBRARY() - -ALLOCATOR_IMPL() -SRCS( - bridge.cpp -) - -PEERDIR( - library/cpp/malloc/api - library/cpp/yt/containers - library/cpp/yt/memory - library/cpp/yt/threading -) - -END() diff --git a/yt/yt/client/table_client/unittests/ya.make b/yt/yt/client/table_client/unittests/ya.make index f7594f2b9aaf..a91bc4b66f6d 100644 --- a/yt/yt/client/table_client/unittests/ya.make +++ b/yt/yt/client/table_client/unittests/ya.make @@ -2,8 +2,6 @@ GTEST(unittester-client-table-client) INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) -ALLOCATOR(YT) - SRCS( columnar_statistics_ut.cpp columnar_ut.cpp diff --git a/yt/yt/client/unittests/ya.make b/yt/yt/client/unittests/ya.make index f5111fca971f..a71a98db2cd0 100644 --- a/yt/yt/client/unittests/ya.make +++ b/yt/yt/client/unittests/ya.make @@ -2,8 +2,6 @@ GTEST(unittester-client) INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) -ALLOCATOR(YT) - PROTO_NAMESPACE(yt) SRCS( diff --git a/yt/yt/core/actions/unittests/ya.make b/yt/yt/core/actions/unittests/ya.make index b1cf89738830..10bbe6d32ff8 100644 --- a/yt/yt/core/actions/unittests/ya.make +++ b/yt/yt/core/actions/unittests/ya.make @@ -2,10 +2,6 @@ GTEST(unittester-core-actions) INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) -IF (NOT OS_WINDOWS AND NOT ARCH_AARCH64) - ALLOCATOR(YT) -ENDIF() - PROTO_NAMESPACE(yt) SRCS( diff --git a/yt/yt/core/bus/unittests/ya.make b/yt/yt/core/bus/unittests/ya.make index 7f56a36940f9..23ff0194554a 100644 --- a/yt/yt/core/bus/unittests/ya.make +++ b/yt/yt/core/bus/unittests/ya.make @@ -2,10 +2,6 @@ GTEST(unittester-core-bus) INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) -IF (NOT OS_WINDOWS AND NOT ARCH_AARCH64) - ALLOCATOR(YT) -ENDIF() - PROTO_NAMESPACE(yt) SRCS( diff --git a/yt/yt/core/compression/unittests/ya.make b/yt/yt/core/compression/unittests/ya.make index ec506bdc6f89..6f7d0a63a1d7 100644 --- a/yt/yt/core/compression/unittests/ya.make +++ b/yt/yt/core/compression/unittests/ya.make @@ -2,10 +2,6 @@ GTEST(unittester-core-compression) INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) -IF (NOT OS_WINDOWS AND NOT ARCH_AARCH64) - ALLOCATOR(YT) -ENDIF() - PROTO_NAMESPACE(yt) SRCS( diff --git a/yt/yt/core/concurrency/unittests/ya.make b/yt/yt/core/concurrency/unittests/ya.make index b8b94dcfc950..61f7b416ad02 100644 --- a/yt/yt/core/concurrency/unittests/ya.make +++ b/yt/yt/core/concurrency/unittests/ya.make @@ -2,10 +2,6 @@ GTEST(unittester-core-concurrency) INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) -IF (NOT OS_WINDOWS AND NOT ARCH_AARCH64) - ALLOCATOR(YT) -ENDIF() - PROTO_NAMESPACE(yt) SRCS( diff --git a/yt/yt/core/crypto/unittests/ya.make b/yt/yt/core/crypto/unittests/ya.make index 460de9e95753..d46f2a4729de 100644 --- a/yt/yt/core/crypto/unittests/ya.make +++ b/yt/yt/core/crypto/unittests/ya.make @@ -2,10 +2,6 @@ GTEST(unittester-core-crypto) INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) -IF (NOT OS_WINDOWS AND NOT ARCH_AARCH64) - ALLOCATOR(YT) -ENDIF() - PROTO_NAMESPACE(yt) SRCS( diff --git 
a/yt/yt/core/http/unittests/ya.make b/yt/yt/core/http/unittests/ya.make index c3724d1234e5..622590287c5f 100644 --- a/yt/yt/core/http/unittests/ya.make +++ b/yt/yt/core/http/unittests/ya.make @@ -2,10 +2,6 @@ GTEST(unittester-core-http) INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) -IF (NOT OS_WINDOWS AND NOT ARCH_AARCH64) - ALLOCATOR(YT) -ENDIF() - PROTO_NAMESPACE(yt) SRCS( diff --git a/yt/yt/core/json/unittests/ya.make b/yt/yt/core/json/unittests/ya.make index 8dc2d207d118..19c3198a5f77 100644 --- a/yt/yt/core/json/unittests/ya.make +++ b/yt/yt/core/json/unittests/ya.make @@ -2,10 +2,6 @@ GTEST(unittester-core-json) INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) -IF (NOT OS_WINDOWS AND NOT ARCH_AARCH64) - ALLOCATOR(YT) -ENDIF() - PROTO_NAMESPACE(yt) SRCS( diff --git a/yt/yt/core/logging/unittests/ya.make b/yt/yt/core/logging/unittests/ya.make index 571cfee61e32..3f5d7833de0c 100644 --- a/yt/yt/core/logging/unittests/ya.make +++ b/yt/yt/core/logging/unittests/ya.make @@ -2,10 +2,6 @@ GTEST(unittester-core-logging) INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) -IF (NOT OS_WINDOWS AND NOT ARCH_AARCH64) - ALLOCATOR(YT) -ENDIF() - PROTO_NAMESPACE(yt) SRCS( diff --git a/yt/yt/core/misc/unittests/ya.make b/yt/yt/core/misc/unittests/ya.make index 3d8571039181..62fb8ece1357 100644 --- a/yt/yt/core/misc/unittests/ya.make +++ b/yt/yt/core/misc/unittests/ya.make @@ -2,10 +2,6 @@ GTEST(unittester-core-misc) INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) -IF (NOT OS_WINDOWS AND NOT ARCH_AARCH64) - ALLOCATOR(YT) -ENDIF() - PROTO_NAMESPACE(yt) SRCS( diff --git a/yt/yt/core/net/unittests/ya.make b/yt/yt/core/net/unittests/ya.make index d1bf83296881..9976ff1c1869 100644 --- a/yt/yt/core/net/unittests/ya.make +++ b/yt/yt/core/net/unittests/ya.make @@ -2,10 +2,6 @@ GTEST(unittester-core-net) INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) -IF (NOT OS_WINDOWS AND NOT ARCH_AARCH64) - ALLOCATOR(YT) -ENDIF() - PROTO_NAMESPACE(yt) SRCS( diff --git a/yt/yt/core/profiling/unittests/ya.make b/yt/yt/core/profiling/unittests/ya.make index b31b812b83c9..8b9cc6c8de60 100644 --- a/yt/yt/core/profiling/unittests/ya.make +++ b/yt/yt/core/profiling/unittests/ya.make @@ -2,10 +2,6 @@ GTEST(unittester-core-profiling) INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) -IF (NOT OS_WINDOWS AND NOT ARCH_AARCH64) - ALLOCATOR(YT) -ENDIF() - PROTO_NAMESPACE(yt) SRCS( diff --git a/yt/yt/core/rpc/unittests/main/ya.make b/yt/yt/core/rpc/unittests/main/ya.make index 773edcce9651..37b494945c5b 100644 --- a/yt/yt/core/rpc/unittests/main/ya.make +++ b/yt/yt/core/rpc/unittests/main/ya.make @@ -2,10 +2,6 @@ GTEST(unittester-core-rpc) INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) -IF (NOT OS_WINDOWS AND NOT ARCH_AARCH64) - ALLOCATOR(YT) -ENDIF() - PROTO_NAMESPACE(yt) SRCS( diff --git a/yt/yt/core/rpc/unittests/rpc_ut.cpp b/yt/yt/core/rpc/unittests/rpc_ut.cpp index d83c1b7d6bbc..c887795911f7 100644 --- a/yt/yt/core/rpc/unittests/rpc_ut.cpp +++ b/yt/yt/core/rpc/unittests/rpc_ut.cpp @@ -620,39 +620,6 @@ TYPED_TEST(TNotGrpcTest, Compression) } } -#if !defined(_asan_enabled_) && !defined(_msan_enabled_) && defined(_linux_) - -TYPED_TEST(TRpcTest, ResponseMemoryTag) -{ - static TMemoryTag testMemoryTag = 12345; - testMemoryTag++; - auto initialMemoryUsage = GetMemoryUsageForTag(testMemoryTag); - - std::vector rsps; - { - TTestProxy proxy(this->CreateChannel()); - TString userName("user"); - - TMemoryTagGuard guard(testMemoryTag); - - for (int i = 0; i < 1000; ++i) { - auto req = proxy.PassCall(); - req->SetUser(userName); - req->SetMutationId(TGuid::Create()); - 
req->SetRetry(false); - auto err = req->Invoke().Get(); - rsps.push_back(err.ValueOrThrow()); - } - } - - auto currentMemoryUsage = GetMemoryUsageForTag(testMemoryTag); - EXPECT_GE(currentMemoryUsage - initialMemoryUsage, 200_KB) - << "InitialUsage: " << initialMemoryUsage << std::endl - << "Current: " << currentMemoryUsage; -} - -#endif - TYPED_TEST(TNotGrpcTest, RequestBytesThrottling) { auto configText = TString(R"({ diff --git a/yt/yt/core/rpc/unittests/shutdown/ya.make b/yt/yt/core/rpc/unittests/shutdown/ya.make index a55c4d4fc02f..ffc34b2bd06c 100644 --- a/yt/yt/core/rpc/unittests/shutdown/ya.make +++ b/yt/yt/core/rpc/unittests/shutdown/ya.make @@ -2,10 +2,6 @@ GTEST(unittester-core-rpc-shutdown) INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) -IF (NOT OS_WINDOWS AND NOT ARCH_AARCH64) - ALLOCATOR(YT) -ENDIF() - PROTO_NAMESPACE(yt) SRCS( diff --git a/yt/yt/core/ya.make b/yt/yt/core/ya.make index 59362bed77aa..56334cc6c093 100644 --- a/yt/yt/core/ya.make +++ b/yt/yt/core/ya.make @@ -390,7 +390,7 @@ RECURSE( test_framework ) -IF (NOT OPENSOURCE) +IF (NOT OPENSOURCE AND OS_LINUX) RECURSE( benchmarks bus/benchmarks diff --git a/yt/yt/core/ypath/unittests/ya.make b/yt/yt/core/ypath/unittests/ya.make index fa6d821da449..b4f2240c8c60 100644 --- a/yt/yt/core/ypath/unittests/ya.make +++ b/yt/yt/core/ypath/unittests/ya.make @@ -2,10 +2,6 @@ GTEST(unittester-core-ypath) INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) -IF (NOT OS_WINDOWS AND NOT ARCH_AARCH64) - ALLOCATOR(YT) -ENDIF() - PROTO_NAMESPACE(yt) SRCS( diff --git a/yt/yt/core/yson/unittests/ya.make b/yt/yt/core/yson/unittests/ya.make index a153d12ce7ab..59a807fca427 100644 --- a/yt/yt/core/yson/unittests/ya.make +++ b/yt/yt/core/yson/unittests/ya.make @@ -2,10 +2,6 @@ GTEST(unittester-core-yson) INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) -IF (NOT OS_WINDOWS AND NOT ARCH_AARCH64) - ALLOCATOR(YT) -ENDIF() - PROTO_NAMESPACE(yt) SRCS( diff --git a/yt/yt/core/ytree/unittests/ya.make b/yt/yt/core/ytree/unittests/ya.make index 034ba0a1d5d7..7196cea98cef 100644 --- a/yt/yt/core/ytree/unittests/ya.make +++ b/yt/yt/core/ytree/unittests/ya.make @@ -2,10 +2,6 @@ GTEST(unittester-core-ytree) INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) -IF (NOT OS_WINDOWS AND NOT ARCH_AARCH64) - ALLOCATOR(YT) -ENDIF() - PROTO_NAMESPACE(yt) SRCS( diff --git a/yt/yt/library/auth/unittests/ya.make b/yt/yt/library/auth/unittests/ya.make index c45504d10ac9..e72e5132581f 100644 --- a/yt/yt/library/auth/unittests/ya.make +++ b/yt/yt/library/auth/unittests/ya.make @@ -2,8 +2,6 @@ GTEST(unittester-library-auth) INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) -ALLOCATOR(YT) - SRCS( auth_ut.cpp ) diff --git a/yt/yt/library/decimal/unittests/ya.make b/yt/yt/library/decimal/unittests/ya.make index 76341ee7b23a..172760632dae 100644 --- a/yt/yt/library/decimal/unittests/ya.make +++ b/yt/yt/library/decimal/unittests/ya.make @@ -2,8 +2,6 @@ GTEST(unittester-library-decimal) INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) -ALLOCATOR(YT) - SRCS( decimal_ut.cpp ) diff --git a/yt/yt/library/erasure/impl/unittests/ya.make b/yt/yt/library/erasure/impl/unittests/ya.make index 15b3f6fb8d9f..d2b30ea1404d 100644 --- a/yt/yt/library/erasure/impl/unittests/ya.make +++ b/yt/yt/library/erasure/impl/unittests/ya.make @@ -2,8 +2,6 @@ GTEST(unittester-library-erasure) INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) -ALLOCATOR(YT) - SRCS( erasure_stability_ut.cpp ) diff --git a/yt/yt/library/process/unittests/ya.make b/yt/yt/library/process/unittests/ya.make index 7c5d0cb48f4a..149d9eee1faf 100644 --- 
a/yt/yt/library/process/unittests/ya.make +++ b/yt/yt/library/process/unittests/ya.make @@ -2,8 +2,6 @@ GTEST(unittester-library-process) INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) -ALLOCATOR(YT) - SRCS( pipes_ut.cpp process_ut.cpp diff --git a/yt/yt/library/tvm/service/unittests/ya.make b/yt/yt/library/tvm/service/unittests/ya.make index 28629e6d8267..23ac522bd06f 100644 --- a/yt/yt/library/tvm/service/unittests/ya.make +++ b/yt/yt/library/tvm/service/unittests/ya.make @@ -1,7 +1,5 @@ GTEST(unittester-library-auth_tvm) -ALLOCATOR(YT) - INCLUDE(${ARCADIA_ROOT}/yt/opensource.inc) PEERDIR( From f3531eaf37b153f0f3829cf63c82a1928d4d9122 Mon Sep 17 00:00:00 2001 From: babenko Date: Fri, 29 Nov 2024 18:15:35 +0300 Subject: [PATCH 04/16] Fix typo commit_hash:01730f5887c8bccb0deb1393bfd15f867792ce5f --- yt/yt/core/concurrency/nonblocking_batcher.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt/yt/core/concurrency/nonblocking_batcher.h b/yt/yt/core/concurrency/nonblocking_batcher.h index 5681f27b135f..841faf590ce5 100644 --- a/yt/yt/core/concurrency/nonblocking_batcher.h +++ b/yt/yt/core/concurrency/nonblocking_batcher.h @@ -8,7 +8,7 @@ namespace NYT::NConcurrency { //////////////////////////////////////////////////////////////////////////////// -DEFINE_ENUM(ETNonblockingBatcherTimerState, +DEFINE_ENUM(ENonblockingBatcherTimerState, (Initial) (Started) (Finished) @@ -97,7 +97,7 @@ class TNonblockingBatcher std::vector Drain(); private: - using ETimerState = ETNonblockingBatcherTimerState; + using ETimerState = ENonblockingBatcherTimerState; TBatchLimiter BatchLimiter_; TDuration BatchDuration_; From c67888be3cb9ffde249bcc0ec11b1a2cde58f60b Mon Sep 17 00:00:00 2001 From: vvvv Date: Fri, 29 Nov 2024 18:37:36 +0300 Subject: [PATCH 05/16] Avoid mutation of the main module resolver init commit_hash:9451f73b18e0feb7c201f456a180d8967a270b0d --- yql/essentials/public/purecalc/common/worker_factory.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yql/essentials/public/purecalc/common/worker_factory.cpp b/yql/essentials/public/purecalc/common/worker_factory.cpp index 173f73b7beb2..27ac0acda8ba 100644 --- a/yql/essentials/public/purecalc/common/worker_factory.cpp +++ b/yql/essentials/public/purecalc/common/worker_factory.cpp @@ -130,7 +130,7 @@ template TExprNode::TPtr TWorkerFactory::Compile( TStringBuf query, ETranslationMode mode, - IModuleResolver::TPtr moduleResolver, + IModuleResolver::TPtr factoryModuleResolver, ui16 syntaxVersion, const THashMap& modules, const TInputSpecBase& inputSpec, @@ -145,6 +145,7 @@ TExprNode::TPtr TWorkerFactory::Compile( TTypeAnnotationContextPtr typeContext; + IModuleResolver::TPtr moduleResolver = factoryModuleResolver ? factoryModuleResolver->CreateMutableChild() : nullptr; typeContext = MakeIntrusive(); typeContext->RandomProvider = CreateDefaultRandomProvider(); typeContext->TimeProvider = DeterministicTimeProviderSeed_ ? 
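Note on the worker_factory.cpp change above: Compile() now builds a per-compilation mutable child of the factory's module resolver (factoryModuleResolver->CreateMutableChild()) instead of mutating the shared resolver held by the factory. The sketch below only illustrates that copy-before-mutate pattern under assumed names; TModuleRegistry, AddModule and CompileQuery are invented for the example and are not the actual purecalc IModuleResolver API.

#include <map>
#include <memory>
#include <string>

class TModuleRegistry {
public:
    using TPtr = std::shared_ptr<TModuleRegistry>;

    // A child starts as a copy of the parent, so later mutations stay local
    // to the child and never leak into the shared, factory-owned instance.
    TPtr CreateMutableChild() const {
        return std::make_shared<TModuleRegistry>(*this);
    }

    void AddModule(const std::string& name, const std::string& source) {
        Modules_[name] = source;
    }

private:
    std::map<std::string, std::string> Modules_;
};

// Per-compilation usage: mutate only the local child, analogous to the
// factoryModuleResolver->CreateMutableChild() call introduced by the patch.
void CompileQuery(const TModuleRegistry::TPtr& factoryRegistry) {
    TModuleRegistry::TPtr local =
        factoryRegistry ? factoryRegistry->CreateMutableChild() : nullptr;
    if (local) {
        local->AddModule("pkg.module", "/* module source */");  // does not affect factoryRegistry
    }
}

Copying the shared_ptr alone would not be enough, since both pointers would still refer to the same underlying registry; the point of CreateMutableChild is to hand each compilation its own mutable object.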
From 0924e1c53b7aec2c5efefe89499154b0a7e902f7 Mon Sep 17 00:00:00 2001 From: morozov1one Date: Fri, 29 Nov 2024 20:28:24 +0300 Subject: [PATCH 06/16] Upgrade mimalloc to 1.8.7 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Below are the notable behavior changes I noticed compared with version 1.7.2, which currently lives in contrib. The full changelog can be found in readme.md. * The default of the [option](https://github.com/microsoft/mimalloc/blob/9cae0d31cd28476664dbaa6e4e6940b9d900842a/src/options.c#L109) that controls how unused memory is returned to the system has changed. Older versions used madvise with the MADV_FREE flag by default, while recent versions use MADV_DONTNEED. This can cause unexpected changes (for the worse) on anonymous-memory-consumption graphs, although actual consumption should stay roughly the same. * The allocator's internal algorithm has changed somewhat. For example, we ran into the fact that the new mimalloc reserves 1 GiB of memory (the size is controlled by an [option](https://github.com/microsoft/mimalloc/blob/2765ec93026f445cad8f38e6b196dd226a1f6e61/src/options.c#L87)) on the very first allocation. By itself this has little effect, but it can cause trouble if the program calls mlockall at startup. commit_hash:dc6d945c1776c874e554f94b705c4e446b0a11d8 --- .../.yandex_meta/devtools.copyrights.report | 77 +- .../.yandex_meta/devtools.licenses.report | 47 +- .../mimalloc/.yandex_meta/licenses.list.txt | 27 +- contrib/libs/mimalloc/SECURITY.md | 41 + .../mimalloc/include/mimalloc-new-delete.h | 26 +- .../libs/mimalloc/include/mimalloc-override.h | 4 +- contrib/libs/mimalloc/include/mimalloc.h | 222 ++- .../{mimalloc-atomic.h => mimalloc/atomic.h} | 91 +- .../internal.h} | 553 ++++---- contrib/libs/mimalloc/include/mimalloc/prim.h | 373 +++++ .../libs/mimalloc/include/mimalloc/track.h | 149 ++ .../{mimalloc-types.h => mimalloc/types.h} | 297 +++- contrib/libs/mimalloc/readme.md | 405 ++++-- contrib/libs/mimalloc/src/alloc-aligned.c | 272 ++-- .../libs/mimalloc/src/alloc-override-osx.c | 281 ---- contrib/libs/mimalloc/src/alloc-override.c | 234 ++- contrib/libs/mimalloc/src/alloc-posix.c | 72 +- contrib/libs/mimalloc/src/alloc.c | 883 ++++-------- contrib/libs/mimalloc/src/arena.c | 1039 +++++++++++--- contrib/libs/mimalloc/src/bitmap.c | 213 +-- contrib/libs/mimalloc/src/bitmap.h | 20 +- contrib/libs/mimalloc/src/free.c | 520 +++++++ contrib/libs/mimalloc/src/heap.c | 227 ++- contrib/libs/mimalloc/src/init.c | 449 +++--- contrib/libs/mimalloc/src/libc.c | 273 ++++ contrib/libs/mimalloc/src/options.c | 362 +++-- contrib/libs/mimalloc/src/os.c | 1256 +++++------------ contrib/libs/mimalloc/src/page-queue.c | 68 +- contrib/libs/mimalloc/src/page.c | 287 ++-- .../src/prim/osx/alloc-override-zone.c | 461 ++++++ contrib/libs/mimalloc/src/prim/osx/prim.c | 9 + contrib/libs/mimalloc/src/prim/prim.c | 27 + contrib/libs/mimalloc/src/prim/unix/prim.c | 882 ++++++++++++ contrib/libs/mimalloc/src/random.c | 147 +- contrib/libs/mimalloc/src/region.c | 505 ------- contrib/libs/mimalloc/src/segment-map.c | 155 ++ contrib/libs/mimalloc/src/segment.c | 954 ++++++------- contrib/libs/mimalloc/src/static.c | 39 - contrib/libs/mimalloc/src/stats.c | 331 ++--- contrib/libs/mimalloc/ya.make | 50 +- 40 files changed, 7527 insertions(+), 4801 deletions(-) create mode 100644 contrib/libs/mimalloc/SECURITY.md rename contrib/libs/mimalloc/include/{mimalloc-atomic.h =>
mimalloc/atomic.h} (83%) rename contrib/libs/mimalloc/include/{mimalloc-internal.h => mimalloc/internal.h} (64%) create mode 100644 contrib/libs/mimalloc/include/mimalloc/prim.h create mode 100644 contrib/libs/mimalloc/include/mimalloc/track.h rename contrib/libs/mimalloc/include/{mimalloc-types.h => mimalloc/types.h} (63%) delete mode 100644 contrib/libs/mimalloc/src/alloc-override-osx.c create mode 100644 contrib/libs/mimalloc/src/free.c create mode 100644 contrib/libs/mimalloc/src/libc.c create mode 100644 contrib/libs/mimalloc/src/prim/osx/alloc-override-zone.c create mode 100644 contrib/libs/mimalloc/src/prim/osx/prim.c create mode 100644 contrib/libs/mimalloc/src/prim/prim.c create mode 100644 contrib/libs/mimalloc/src/prim/unix/prim.c delete mode 100644 contrib/libs/mimalloc/src/region.c create mode 100644 contrib/libs/mimalloc/src/segment-map.c delete mode 100644 contrib/libs/mimalloc/src/static.c diff --git a/contrib/libs/mimalloc/.yandex_meta/devtools.copyrights.report b/contrib/libs/mimalloc/.yandex_meta/devtools.copyrights.report index a37ac7da01f8..4b220086813f 100644 --- a/contrib/libs/mimalloc/.yandex_meta/devtools.copyrights.report +++ b/contrib/libs/mimalloc/.yandex_meta/devtools.copyrights.report @@ -29,16 +29,6 @@ # FILE_INCLUDE - include all file data into licenses text file # ======================= -KEEP COPYRIGHT_SERVICE_LABEL 21dec668d9ab2431f46cc70979134ba7 -BELONGS ya.make - Note: matched license text is too long. Read it in the source files. - Scancode info: - Original SPDX id: COPYRIGHT_SERVICE_LABEL - Score : 100.00 - Match type : COPYRIGHT - Files with this license: - src/bitmap.c [2:4] - KEEP COPYRIGHT_SERVICE_LABEL 25dcefb85a8e188fc5c56da58857f739 BELONGS ya.make License text: @@ -58,20 +48,14 @@ BELONGS ya.make Score : 100.00 Match type : COPYRIGHT Files with this license: - include/mimalloc-internal.h [2:4] - include/mimalloc-types.h [2:4] - include/mimalloc.h [2:4] src/alloc-aligned.c [2:4] src/alloc-override.c [2:4] src/alloc-posix.c [2:4] - src/alloc.c [2:4] src/heap.c [2:4] - src/init.c [2:4] src/options.c [2:4] - src/os.c [2:4] src/stats.c [2:4] -KEEP COPYRIGHT_SERVICE_LABEL 28da6750f9f70938a34a2683265c5f37 +KEEP COPYRIGHT_SERVICE_LABEL 5b7d847fe742e0704b8071bd0042a721 BELONGS ya.make Note: matched license text is too long. Read it in the source files. Scancode info: @@ -79,9 +63,10 @@ BELONGS ya.make Score : 100.00 Match type : COPYRIGHT Files with this license: - include/mimalloc-atomic.h [2:4] + include/mimalloc-new-delete.h [2:4] + include/mimalloc-override.h [2:4] -KEEP COPYRIGHT_SERVICE_LABEL 4d891fec2fadb396208278a6d4280c2c +KEEP COPYRIGHT_SERVICE_LABEL 5f21aa30041415548b09b1e8e25da7fb BELONGS ya.make Note: matched license text is too long. Read it in the source files. Scancode info: @@ -89,9 +74,10 @@ BELONGS ya.make Score : 100.00 Match type : COPYRIGHT Files with this license: - src/bitmap.h [2:4] + src/arena.c [2:4] + src/segment-map.c [2:4] -KEEP COPYRIGHT_SERVICE_LABEL 5b7d847fe742e0704b8071bd0042a721 +KEEP COPYRIGHT_SERVICE_LABEL 5fc5246a7da6971940f2a93100292b4d BELONGS ya.make Note: matched license text is too long. Read it in the source files. Scancode info: @@ -99,10 +85,9 @@ BELONGS ya.make Score : 100.00 Match type : COPYRIGHT Files with this license: - include/mimalloc-new-delete.h [2:4] - include/mimalloc-override.h [2:4] + src/random.c [2:4] -KEEP COPYRIGHT_SERVICE_LABEL 5fc5246a7da6971940f2a93100292b4d +KEEP COPYRIGHT_SERVICE_LABEL 8417b808fabacd093257b7972e1b7c8f BELONGS ya.make Note: matched license text is too long. 
Read it in the source files. Scancode info: @@ -110,10 +95,20 @@ BELONGS ya.make Score : 100.00 Match type : COPYRIGHT Files with this license: - src/arena.c [2:4] - src/random.c [2:4] + include/mimalloc/atomic.h [2:4] + +KEEP COPYRIGHT_SERVICE_LABEL bb2ecc7d3573627ff5673fd4981120f7 +BELONGS ya.make + Note: matched license text is too long. Read it in the source files. + Scancode info: + Original SPDX id: COPYRIGHT_SERVICE_LABEL + Score : 100.00 + Match type : COPYRIGHT + Files with this license: + src/bitmap.c [2:4] + src/bitmap.h [2:4] -KEEP COPYRIGHT_SERVICE_LABEL fe43a4aab9cf694378c07b2f43474c64 +KEEP COPYRIGHT_SERVICE_LABEL e7f053f38ca4d796d9d56538f0ce1ac7 BELONGS ya.make Note: matched license text is too long. Read it in the source files. Scancode info: @@ -121,9 +116,17 @@ BELONGS ya.make Score : 100.00 Match type : COPYRIGHT Files with this license: - src/region.c [2:4] + include/mimalloc.h [2:4] + include/mimalloc/internal.h [2:4] + include/mimalloc/prim.h [2:4] + include/mimalloc/track.h [2:4] + src/libc.c [2:4] + src/os.c [2:4] + src/prim/osx/prim.c [2:4] + src/prim/prim.c [2:4] + src/prim/unix/prim.c [2:4] -KEEP COPYRIGHT_SERVICE_LABEL feb05913c2d79921f57fa41bec01920f +KEEP COPYRIGHT_SERVICE_LABEL fc7c1095a8a64f3166ead9be4cfabb96 BELONGS ya.make Note: matched license text is too long. Read it in the source files. Scancode info: @@ -131,8 +134,20 @@ BELONGS ya.make Score : 100.00 Match type : COPYRIGHT Files with this license: - src/alloc-override-osx.c [2:4] + include/mimalloc/types.h [2:4] + src/alloc.c [2:4] + src/free.c [2:4] src/page-queue.c [2:4] src/page.c [2:4] src/segment.c [2:4] - src/static.c [2:4] + +KEEP COPYRIGHT_SERVICE_LABEL fe63ec86a6a35162f9131f69ba4bc3e6 +BELONGS ya.make + Note: matched license text is too long. Read it in the source files. + Scancode info: + Original SPDX id: COPYRIGHT_SERVICE_LABEL + Score : 100.00 + Match type : COPYRIGHT + Files with this license: + src/init.c [2:4] + src/prim/osx/alloc-override-zone.c [2:4] diff --git a/contrib/libs/mimalloc/.yandex_meta/devtools.licenses.report b/contrib/libs/mimalloc/.yandex_meta/devtools.licenses.report index 7bfbc229ab11..6791efdfc34f 100644 --- a/contrib/libs/mimalloc/.yandex_meta/devtools.licenses.report +++ b/contrib/libs/mimalloc/.yandex_meta/devtools.licenses.report @@ -31,7 +31,6 @@ SKIP LicenseRef-scancode-generic-cla 0539c29f2b403f650800fcba3b1c53a6 BELONGS ya.make - # Contributor License Agreement License text: Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us Scancode info: @@ -40,11 +39,11 @@ BELONGS ya.make Match type : NOTICE Links : https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/generic-cla.LICENSE Files with this license: - readme.md [680:680] + readme.md [816:816] KEEP MIT AND MIT 2d229fcf116e5a7facba2d4dcccf15ba BELONGS ya.make -FILE_INCLUDE LICENSE found in files: include/mimalloc-internal.h at line 5, include/mimalloc-types.h at line 5, include/mimalloc.h at line 5, src/alloc-aligned.c at line 5, src/alloc.c at line 5, src/options.c at line 5 +FILE_INCLUDE LICENSE found in files: include/mimalloc.h at line 5, include/mimalloc/atomic.h at line 5, include/mimalloc/internal.h at line 5, include/mimalloc/track.h at line 5, src/init.c at line 5, src/os.c at line 5 License text: This is free software; you can redistribute it and/or modify it under the terms of the MIT license. 
A copy of the license can be found in the file @@ -55,24 +54,30 @@ FILE_INCLUDE LICENSE found in files: include/mimalloc-internal.h at line 5, incl Match type : NOTICE Links : http://opensource.org/licenses/mit-license.php, https://spdx.org/licenses/MIT Files with this license: - include/mimalloc-atomic.h [3:5] include/mimalloc-new-delete.h [3:5] include/mimalloc-override.h [3:5] - src/alloc-override-osx.c [3:5] + include/mimalloc/prim.h [3:5] + include/mimalloc/types.h [3:5] + src/alloc-aligned.c [3:5] src/alloc-override.c [3:5] src/alloc-posix.c [3:5] + src/alloc.c [3:5] src/arena.c [3:5] src/bitmap.c [3:5] src/bitmap.h [3:5] + src/free.c [3:5] src/heap.c [3:5] - src/init.c [3:5] - src/os.c [3:5] + src/libc.c [3:5] + src/options.c [3:5] src/page-queue.c [3:5] src/page.c [3:5] + src/prim/osx/alloc-override-zone.c [3:5] + src/prim/osx/prim.c [3:5] + src/prim/prim.c [3:5] + src/prim/unix/prim.c [3:5] src/random.c [3:5] - src/region.c [3:5] + src/segment-map.c [3:5] src/segment.c [3:5] - src/static.c [3:5] src/stats.c [3:5] Scancode info: Original SPDX id: MIT @@ -80,12 +85,12 @@ FILE_INCLUDE LICENSE found in files: include/mimalloc-internal.h at line 5, incl Match type : NOTICE Links : http://opensource.org/licenses/mit-license.php, https://spdx.org/licenses/MIT Files with this license: - include/mimalloc-internal.h [3:5] - include/mimalloc-types.h [3:5] include/mimalloc.h [3:5] - src/alloc-aligned.c [3:5] - src/alloc.c [3:5] - src/options.c [3:5] + include/mimalloc/atomic.h [3:5] + include/mimalloc/internal.h [3:5] + include/mimalloc/track.h [3:5] + src/init.c [3:5] + src/os.c [3:5] KEEP MIT 399584035c417b91040964779555dfac BELONGS ya.make @@ -99,6 +104,20 @@ BELONGS ya.make Files with this license: LICENSE [1:1] +KEEP MIT 46d3a844e933821ebdb401dff1a34bc0 +BELONGS ya.make + License text: + This is free software; you can redistribute it and/or modify it under the + terms of the MIT license. A copy of the license can be found in the file + Scancode info: + Original SPDX id: BSD-3-Clause + Score : 52.38 + Match type : NOTICE + Links : http://www.opensource.org/licenses/BSD-3-Clause, https://spdx.org/licenses/BSD-3-Clause + Files with this license: + include/mimalloc/atomic.h [3:4] + include/mimalloc/track.h [3:4] + KEEP MIT 54575e81a786e9aa7d98337ec2e1ebb0 BELONGS ya.make Note: matched license text is too long. Read it in the source files. diff --git a/contrib/libs/mimalloc/.yandex_meta/licenses.list.txt b/contrib/libs/mimalloc/.yandex_meta/licenses.list.txt index 3408919d8b38..69648edbd5d3 100644 --- a/contrib/libs/mimalloc/.yandex_meta/licenses.list.txt +++ b/contrib/libs/mimalloc/.yandex_meta/licenses.list.txt @@ -5,47 +5,53 @@ terms of the MIT license. A copy of the license can be found in the file ====================COPYRIGHT==================== -Copyright (c) 2018-2020, Microsoft Research, Daan Leijen +Copyright (c) 2018-2021 Microsoft Corporation, Daan Leijen + + +====================COPYRIGHT==================== +Copyright (c) 2018-2021, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file ====================COPYRIGHT==================== -Copyright (c) 2018-2021 Microsoft Corporation, Daan Leijen +Copyright (c) 2018-2022, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. 
A copy of the license can be found in the file ====================COPYRIGHT==================== -Copyright (c) 2018-2021 Microsoft Research, Daan Leijen +Copyright (c) 2018-2023 Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file ====================COPYRIGHT==================== -Copyright (c) 2018-2021, Microsoft Research, Daan Leijen +Copyright (c) 2018-2023, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file ====================COPYRIGHT==================== -Copyright (c) 2019-2020 Microsoft Research, Daan Leijen +Copyright (c) 2018-2024, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file ====================COPYRIGHT==================== -Copyright (c) 2019-2020, Microsoft Research, Daan Leijen +Copyright (c) 2019-2021, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file ====================COPYRIGHT==================== -Copyright (c) 2019-2021 Microsoft Research, Daan Leijen +Copyright (c) 2019-2023 Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file ====================COPYRIGHT==================== -Copyright (c) 2019-2021, Microsoft Research, Daan Leijen +Copyright (c) 2019-2023, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file @@ -97,6 +103,11 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +====================MIT==================== +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file + + ====================MIT AND MIT==================== This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file diff --git a/contrib/libs/mimalloc/SECURITY.md b/contrib/libs/mimalloc/SECURITY.md new file mode 100644 index 000000000000..b3c89efc852e --- /dev/null +++ b/contrib/libs/mimalloc/SECURITY.md @@ -0,0 +1,41 @@ + + +## Security + +Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin). + +If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below. 
+ +## Reporting Security Issues + +**Please do not report security vulnerabilities through public GitHub issues.** + +Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report). + +If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp). + +You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). + +Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: + + * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) + * Full paths of source file(s) related to the manifestation of the issue + * The location of the affected source code (tag/branch/commit or direct URL) + * Any special configuration required to reproduce the issue + * Step-by-step instructions to reproduce the issue + * Proof-of-concept or exploit code (if possible) + * Impact of the issue, including how an attacker might exploit the issue + +This information will help us triage your report more quickly. + +If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs. + +## Preferred Languages + +We prefer all communications to be in English. + +## Policy + +Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd). + + diff --git a/contrib/libs/mimalloc/include/mimalloc-new-delete.h b/contrib/libs/mimalloc/include/mimalloc-new-delete.h index ba208f05569b..c16f4a6653d9 100644 --- a/contrib/libs/mimalloc/include/mimalloc-new-delete.h +++ b/contrib/libs/mimalloc/include/mimalloc-new-delete.h @@ -22,14 +22,26 @@ terms of the MIT license. 
A copy of the license can be found in the file #include #include + #if defined(_MSC_VER) && defined(_Ret_notnull_) && defined(_Post_writable_byte_size_) + // stay consistent with VCRT definitions + #define mi_decl_new(n) mi_decl_nodiscard mi_decl_restrict _Ret_notnull_ _Post_writable_byte_size_(n) + #define mi_decl_new_nothrow(n) mi_decl_nodiscard mi_decl_restrict _Ret_maybenull_ _Success_(return != NULL) _Post_writable_byte_size_(n) + #else + #define mi_decl_new(n) mi_decl_nodiscard mi_decl_restrict + #define mi_decl_new_nothrow(n) mi_decl_nodiscard mi_decl_restrict + #endif + void operator delete(void* p) noexcept { mi_free(p); }; void operator delete[](void* p) noexcept { mi_free(p); }; - void* operator new(std::size_t n) noexcept(false) { return mi_new(n); } - void* operator new[](std::size_t n) noexcept(false) { return mi_new(n); } + void operator delete (void* p, const std::nothrow_t&) noexcept { mi_free(p); } + void operator delete[](void* p, const std::nothrow_t&) noexcept { mi_free(p); } + + mi_decl_new(n) void* operator new(std::size_t n) noexcept(false) { return mi_new(n); } + mi_decl_new(n) void* operator new[](std::size_t n) noexcept(false) { return mi_new(n); } - void* operator new (std::size_t n, const std::nothrow_t& tag) noexcept { (void)(tag); return mi_new_nothrow(n); } - void* operator new[](std::size_t n, const std::nothrow_t& tag) noexcept { (void)(tag); return mi_new_nothrow(n); } + mi_decl_new_nothrow(n) void* operator new (std::size_t n, const std::nothrow_t& tag) noexcept { (void)(tag); return mi_new_nothrow(n); } + mi_decl_new_nothrow(n) void* operator new[](std::size_t n, const std::nothrow_t& tag) noexcept { (void)(tag); return mi_new_nothrow(n); } #if (__cplusplus >= 201402L || _MSC_VER >= 1916) void operator delete (void* p, std::size_t n) noexcept { mi_free_size(p,n); }; @@ -41,9 +53,11 @@ terms of the MIT license. 
A copy of the license can be found in the file void operator delete[](void* p, std::align_val_t al) noexcept { mi_free_aligned(p, static_cast(al)); } void operator delete (void* p, std::size_t n, std::align_val_t al) noexcept { mi_free_size_aligned(p, n, static_cast(al)); }; void operator delete[](void* p, std::size_t n, std::align_val_t al) noexcept { mi_free_size_aligned(p, n, static_cast(al)); }; + void operator delete (void* p, std::align_val_t al, const std::nothrow_t&) noexcept { mi_free_aligned(p, static_cast(al)); } + void operator delete[](void* p, std::align_val_t al, const std::nothrow_t&) noexcept { mi_free_aligned(p, static_cast(al)); } - void* operator new( std::size_t n, std::align_val_t al) noexcept(false) { return mi_new_aligned(n, static_cast(al)); } - void* operator new[]( std::size_t n, std::align_val_t al) noexcept(false) { return mi_new_aligned(n, static_cast(al)); } + void* operator new (std::size_t n, std::align_val_t al) noexcept(false) { return mi_new_aligned(n, static_cast(al)); } + void* operator new[](std::size_t n, std::align_val_t al) noexcept(false) { return mi_new_aligned(n, static_cast(al)); } void* operator new (std::size_t n, std::align_val_t al, const std::nothrow_t&) noexcept { return mi_new_aligned_nothrow(n, static_cast(al)); } void* operator new[](std::size_t n, std::align_val_t al, const std::nothrow_t&) noexcept { return mi_new_aligned_nothrow(n, static_cast(al)); } #endif diff --git a/contrib/libs/mimalloc/include/mimalloc-override.h b/contrib/libs/mimalloc/include/mimalloc-override.h index 7d9f3e7d0ccc..48a8a6226a05 100644 --- a/contrib/libs/mimalloc/include/mimalloc-override.h +++ b/contrib/libs/mimalloc/include/mimalloc-override.h @@ -24,7 +24,7 @@ not accidentally mix pointers from different allocators). #define free(p) mi_free(p) #define strdup(s) mi_strdup(s) -#define strndup(s,n) mi_strndup(s,n) +#define strndup(s,n) mi_strndup(s,n) #define realpath(f,n) mi_realpath(f,n) // Microsoft extensions @@ -43,11 +43,13 @@ not accidentally mix pointers from different allocators). #define reallocf(p,n) mi_reallocf(p,n) #define malloc_size(p) mi_usable_size(p) #define malloc_usable_size(p) mi_usable_size(p) +#define malloc_good_size(sz) mi_malloc_good_size(sz) #define cfree(p) mi_free(p) #define valloc(n) mi_valloc(n) #define pvalloc(n) mi_pvalloc(n) #define reallocarray(p,s,n) mi_reallocarray(p,s,n) +#define reallocarr(p,s,n) mi_reallocarr(p,s,n) #define memalign(a,n) mi_memalign(a,n) #define aligned_alloc(a,n) mi_aligned_alloc(a,n) #define posix_memalign(p,a,n) mi_posix_memalign(p,a,n) diff --git a/contrib/libs/mimalloc/include/mimalloc.h b/contrib/libs/mimalloc/include/mimalloc.h index fe5aa8f34355..ae6f99b4b465 100644 --- a/contrib/libs/mimalloc/include/mimalloc.h +++ b/contrib/libs/mimalloc/include/mimalloc.h @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2021, Microsoft Research, Daan Leijen +Copyright (c) 2018-2023, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -8,7 +8,7 @@ terms of the MIT license. 
A copy of the license can be found in the file #ifndef MIMALLOC_H #define MIMALLOC_H -#define MI_MALLOC_VERSION 171 // major + 2 digits minor +#define MI_MALLOC_VERSION 187 // major + 2 digits minor // ------------------------------------------------------ // Compiler specific attributes @@ -26,8 +26,10 @@ terms of the MIT license. A copy of the license can be found in the file #if defined(__cplusplus) && (__cplusplus >= 201703) #define mi_decl_nodiscard [[nodiscard]] -#elif (__GNUC__ >= 4) || defined(__clang__) // includes clang, icc, and clang-cl +#elif (defined(__GNUC__) && (__GNUC__ >= 4)) || defined(__clang__) // includes clang, icc, and clang-cl #define mi_decl_nodiscard __attribute__((warn_unused_result)) +#elif defined(_HAS_NODISCARD) + #define mi_decl_nodiscard _NODISCARD #elif (_MSC_VER >= 1700) #define mi_decl_nodiscard _Check_return_ #else @@ -58,8 +60,12 @@ terms of the MIT license. A copy of the license can be found in the file #define mi_attr_alloc_size2(s1,s2) #define mi_attr_alloc_align(p) #elif defined(__GNUC__) // includes clang and icc + #if defined(MI_SHARED_LIB) && defined(MI_SHARED_LIB_EXPORT) + #define mi_decl_export __attribute__((visibility("default"))) + #else + #define mi_decl_export + #endif #define mi_cdecl // leads to warnings... __attribute__((cdecl)) - #define mi_decl_export __attribute__((visibility("default"))) #define mi_decl_restrict #define mi_attr_malloc __attribute__((malloc)) #if (defined(__clang_major__) && (__clang_major__ < 4)) || (__GNUC__ < 5) @@ -153,8 +159,8 @@ mi_decl_export void mi_thread_init(void) mi_attr_noexcept; mi_decl_export void mi_thread_done(void) mi_attr_noexcept; mi_decl_export void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept; -mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs, - size_t* current_rss, size_t* peak_rss, +mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs, + size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults) mi_attr_noexcept; // ------------------------------------------------------------------------------------- @@ -249,8 +255,9 @@ typedef struct mi_heap_area_s { void* blocks; // start of the area containing heap blocks size_t reserved; // bytes reserved for this area (virtual) size_t committed; // current available bytes for this area - size_t used; // bytes in use by allocated blocks + size_t used; // number of allocated blocks size_t block_size; // size in bytes of each block + size_t full_block_size; // size in bytes of a full block including padding and metadata. 
} mi_heap_area_t; typedef bool (mi_cdecl mi_block_visit_fun)(const mi_heap_t* heap, const mi_heap_area_t* area, void* block, size_t block_size, void* arg); @@ -267,6 +274,19 @@ mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size mi_decl_export int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept; mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept; +mi_decl_export void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept; + +// Experimental: heaps associated with specific memory arena's +typedef int mi_arena_id_t; +mi_decl_export void* mi_arena_area(mi_arena_id_t arena_id, size_t* size); +mi_decl_export int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept; +mi_decl_export int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept; +mi_decl_export bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept; + +#if MI_MALLOC_VERSION >= 182 +// Create a heap that only allocates in the specified arena +mi_decl_nodiscard mi_decl_export mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id); +#endif // deprecated mi_decl_export int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept; @@ -292,33 +312,49 @@ mi_decl_export int mi_reserve_huge_os_pages(size_t pages, double max_secs, size // ------------------------------------------------------ -// Options, all `false` by default +// Options // ------------------------------------------------------ typedef enum mi_option_e { // stable options - mi_option_show_errors, - mi_option_show_stats, - mi_option_verbose, - // the following options are experimental - mi_option_eager_commit, - mi_option_eager_region_commit, - mi_option_reset_decommits, - mi_option_large_os_pages, // implies eager commit - mi_option_reserve_huge_os_pages, - mi_option_reserve_os_memory, - mi_option_segment_cache, - mi_option_page_reset, - mi_option_abandoned_page_reset, - mi_option_segment_reset, - mi_option_eager_commit_delay, - mi_option_reset_delay, - mi_option_use_numa_nodes, - mi_option_limit_os_alloc, - mi_option_os_tag, - mi_option_max_errors, - mi_option_max_warnings, - _mi_option_last + mi_option_show_errors, // print error messages + mi_option_show_stats, // print statistics on termination + mi_option_verbose, // print verbose messages + // advanced options + mi_option_eager_commit, // eager commit segments? (after `eager_commit_delay` segments) (=1) + mi_option_arena_eager_commit, // eager commit arenas? Use 2 to enable just on overcommit systems (=2) + mi_option_purge_decommits, // should a memory purge decommit? (=1). Set to 0 to use memory reset on a purge (instead of decommit) + mi_option_allow_large_os_pages, // allow large (2 or 4 MiB) OS pages, implies eager commit. If false, also disables THP for the process. 
+ mi_option_reserve_huge_os_pages, // reserve N huge OS pages (1GiB pages) at startup + mi_option_reserve_huge_os_pages_at, // reserve huge OS pages at a specific NUMA node + mi_option_reserve_os_memory, // reserve specified amount of OS memory in an arena at startup (internally, this value is in KiB; use `mi_option_get_size`) + mi_option_deprecated_segment_cache, + mi_option_deprecated_page_reset, + mi_option_abandoned_page_purge, // immediately purge delayed purges on thread termination + mi_option_deprecated_segment_reset, + mi_option_eager_commit_delay, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) + mi_option_purge_delay, // memory purging is delayed by N milli seconds; use 0 for immediate purging or -1 for no purging at all. (=10) + mi_option_use_numa_nodes, // 0 = use all available numa nodes, otherwise use at most N nodes. + mi_option_disallow_os_alloc, // 1 = do not use OS memory for allocation (but only programmatically reserved arenas) + mi_option_os_tag, // tag used for OS logging (macOS only for now) (=100) + mi_option_max_errors, // issue at most N error messages + mi_option_max_warnings, // issue at most N warning messages + mi_option_max_segment_reclaim, // max. percentage of the abandoned segments can be reclaimed per try (=10%) + mi_option_destroy_on_exit, // if set, release all memory on exit; sometimes used for dynamic unloading but can be unsafe + mi_option_arena_reserve, // initial memory size for arena reservation (= 1 GiB on 64-bit) (internally, this value is in KiB; use `mi_option_get_size`) + mi_option_arena_purge_mult, // multiplier for `purge_delay` for the purging delay for arenas (=10) + mi_option_purge_extend_delay, + mi_option_abandoned_reclaim_on_free, // allow to reclaim an abandoned segment on a free (=1) + mi_option_disallow_arena_alloc, // 1 = do not use arena's for allocation (except if using specific arena id's) + mi_option_retry_on_oom, // retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries. 
(only on windows) + _mi_option_last, + // legacy option names + mi_option_large_os_pages = mi_option_allow_large_os_pages, + mi_option_eager_region_commit = mi_option_arena_eager_commit, + mi_option_reset_decommits = mi_option_purge_decommits, + mi_option_reset_delay = mi_option_purge_delay, + mi_option_abandoned_page_reset = mi_option_abandoned_page_purge, + mi_option_limit_os_alloc = mi_option_disallow_os_alloc } mi_option_t; @@ -328,7 +364,9 @@ mi_decl_export void mi_option_disable(mi_option_t option); mi_decl_export void mi_option_set_enabled(mi_option_t option, bool enable); mi_decl_export void mi_option_set_enabled_default(mi_option_t option, bool enable); -mi_decl_nodiscard mi_decl_export long mi_option_get(mi_option_t option); +mi_decl_nodiscard mi_decl_export long mi_option_get(mi_option_t option); +mi_decl_nodiscard mi_decl_export long mi_option_get_clamp(mi_option_t option, long min, long max); +mi_decl_nodiscard mi_decl_export size_t mi_option_get_size(mi_option_t option); mi_decl_export void mi_option_set(mi_option_t option, long value); mi_decl_export void mi_option_set_default(mi_option_t option, long value); @@ -342,6 +380,7 @@ mi_decl_export void mi_option_set_default(mi_option_t option, long value); mi_decl_export void mi_cfree(void* p) mi_attr_noexcept; mi_decl_export void* mi__expand(void* p, size_t newsize) mi_attr_noexcept; mi_decl_nodiscard mi_decl_export size_t mi_malloc_size(const void* p) mi_attr_noexcept; +mi_decl_nodiscard mi_decl_export size_t mi_malloc_good_size(size_t size) mi_attr_noexcept; mi_decl_nodiscard mi_decl_export size_t mi_malloc_usable_size(const void *p) mi_attr_noexcept; mi_decl_export int mi_posix_memalign(void** p, size_t alignment, size_t size) mi_attr_noexcept; @@ -351,6 +390,7 @@ mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_pvalloc(size_t size) mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_aligned_alloc(size_t alignment, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(1); mi_decl_nodiscard mi_decl_export void* mi_reallocarray(void* p, size_t count, size_t size) mi_attr_noexcept mi_attr_alloc_size2(2,3); +mi_decl_nodiscard mi_decl_export int mi_reallocarr(void* p, size_t count, size_t size) mi_attr_noexcept; mi_decl_nodiscard mi_decl_export void* mi_aligned_recalloc(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept; mi_decl_nodiscard mi_decl_export void* mi_aligned_offset_recalloc(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept; @@ -373,6 +413,9 @@ mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_new_n(size_t count, s mi_decl_nodiscard mi_decl_export void* mi_new_realloc(void* p, size_t newsize) mi_attr_alloc_size(2); mi_decl_nodiscard mi_decl_export void* mi_new_reallocn(void* p, size_t newcount, size_t size) mi_attr_alloc_size2(2, 3); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_alloc_new(mi_heap_t* heap, size_t size) mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_alloc_new_n(mi_heap_t* heap, size_t count, size_t size) mi_attr_malloc mi_attr_alloc_size2(2, 3); + #ifdef __cplusplus } #endif @@ -383,13 +426,14 @@ mi_decl_nodiscard mi_decl_export void* mi_new_reallocn(void* p, size_t newcount, // --------------------------------------------------------------------------------------------- #ifdef __cplusplus +#include // std::size_t #include // PTRDIFF_MAX #if (__cplusplus >= 201103L) || (_MSC_VER > 1900) // C++11 #include // 
std::true_type #include // std::forward #endif -template struct mi_stl_allocator { +template struct _mi_stl_allocator_common { typedef T value_type; typedef std::size_t size_type; typedef std::ptrdiff_t difference_type; @@ -397,6 +441,27 @@ template struct mi_stl_allocator { typedef value_type const& const_reference; typedef value_type* pointer; typedef value_type const* const_pointer; + + #if ((__cplusplus >= 201103L) || (_MSC_VER > 1900)) // C++11 + using propagate_on_container_copy_assignment = std::true_type; + using propagate_on_container_move_assignment = std::true_type; + using propagate_on_container_swap = std::true_type; + template void construct(U* p, Args&& ...args) { ::new(p) U(std::forward(args)...); } + template void destroy(U* p) mi_attr_noexcept { p->~U(); } + #else + void construct(pointer p, value_type const& val) { ::new(p) value_type(val); } + void destroy(pointer p) { p->~value_type(); } + #endif + + size_type max_size() const mi_attr_noexcept { return (PTRDIFF_MAX/sizeof(value_type)); } + pointer address(reference x) const { return &x; } + const_pointer address(const_reference x) const { return &x; } +}; + +template struct mi_stl_allocator : public _mi_stl_allocator_common { + using typename _mi_stl_allocator_common::size_type; + using typename _mi_stl_allocator_common::value_type; + using typename _mi_stl_allocator_common::pointer; template struct rebind { typedef mi_stl_allocator other; }; mi_stl_allocator() mi_attr_noexcept = default; @@ -413,24 +478,91 @@ template struct mi_stl_allocator { #endif #if ((__cplusplus >= 201103L) || (_MSC_VER > 1900)) // C++11 - using propagate_on_container_copy_assignment = std::true_type; - using propagate_on_container_move_assignment = std::true_type; - using propagate_on_container_swap = std::true_type; - using is_always_equal = std::true_type; - template void construct(U* p, Args&& ...args) { ::new(p) U(std::forward(args)...); } - template void destroy(U* p) mi_attr_noexcept { p->~U(); } - #else - void construct(pointer p, value_type const& val) { ::new(p) value_type(val); } - void destroy(pointer p) { p->~value_type(); } + using is_always_equal = std::true_type; #endif - - size_type max_size() const mi_attr_noexcept { return (PTRDIFF_MAX/sizeof(value_type)); } - pointer address(reference x) const { return &x; } - const_pointer address(const_reference x) const { return &x; } }; template bool operator==(const mi_stl_allocator& , const mi_stl_allocator& ) mi_attr_noexcept { return true; } template bool operator!=(const mi_stl_allocator& , const mi_stl_allocator& ) mi_attr_noexcept { return false; } + + +#if (__cplusplus >= 201103L) || (_MSC_VER >= 1900) // C++11 +#define MI_HAS_HEAP_STL_ALLOCATOR 1 + +#include // std::shared_ptr + +// Common base class for STL allocators in a specific heap +template struct _mi_heap_stl_allocator_common : public _mi_stl_allocator_common { + using typename _mi_stl_allocator_common::size_type; + using typename _mi_stl_allocator_common::value_type; + using typename _mi_stl_allocator_common::pointer; + + _mi_heap_stl_allocator_common(mi_heap_t* hp) : heap(hp, [](mi_heap_t*) {}) {} /* will not delete nor destroy the passed in heap */ + + #if (__cplusplus >= 201703L) // C++17 + mi_decl_nodiscard T* allocate(size_type count) { return static_cast(mi_heap_alloc_new_n(this->heap.get(), count, sizeof(T))); } + mi_decl_nodiscard T* allocate(size_type count, const void*) { return allocate(count); } + #else + mi_decl_nodiscard pointer allocate(size_type count, const void* = 0) { return 
static_cast(mi_heap_alloc_new_n(this->heap.get(), count, sizeof(value_type))); } + #endif + + #if ((__cplusplus >= 201103L) || (_MSC_VER > 1900)) // C++11 + using is_always_equal = std::false_type; + #endif + + void collect(bool force) { mi_heap_collect(this->heap.get(), force); } + template bool is_equal(const _mi_heap_stl_allocator_common& x) const { return (this->heap == x.heap); } + +protected: + std::shared_ptr heap; + template friend struct _mi_heap_stl_allocator_common; + + _mi_heap_stl_allocator_common() { + mi_heap_t* hp = mi_heap_new(); + this->heap.reset(hp, (_mi_destroy ? &heap_destroy : &heap_delete)); /* calls heap_delete/destroy when the refcount drops to zero */ + } + _mi_heap_stl_allocator_common(const _mi_heap_stl_allocator_common& x) mi_attr_noexcept : heap(x.heap) { } + template _mi_heap_stl_allocator_common(const _mi_heap_stl_allocator_common& x) mi_attr_noexcept : heap(x.heap) { } + +private: + static void heap_delete(mi_heap_t* hp) { if (hp != NULL) { mi_heap_delete(hp); } } + static void heap_destroy(mi_heap_t* hp) { if (hp != NULL) { mi_heap_destroy(hp); } } +}; + +// STL allocator allocation in a specific heap +template struct mi_heap_stl_allocator : public _mi_heap_stl_allocator_common { + using typename _mi_heap_stl_allocator_common::size_type; + mi_heap_stl_allocator() : _mi_heap_stl_allocator_common() { } // creates fresh heap that is deleted when the destructor is called + mi_heap_stl_allocator(mi_heap_t* hp) : _mi_heap_stl_allocator_common(hp) { } // no delete nor destroy on the passed in heap + template mi_heap_stl_allocator(const mi_heap_stl_allocator& x) mi_attr_noexcept : _mi_heap_stl_allocator_common(x) { } + + mi_heap_stl_allocator select_on_container_copy_construction() const { return *this; } + void deallocate(T* p, size_type) { mi_free(p); } + template struct rebind { typedef mi_heap_stl_allocator other; }; +}; + +template bool operator==(const mi_heap_stl_allocator& x, const mi_heap_stl_allocator& y) mi_attr_noexcept { return (x.is_equal(y)); } +template bool operator!=(const mi_heap_stl_allocator& x, const mi_heap_stl_allocator& y) mi_attr_noexcept { return (!x.is_equal(y)); } + + +// STL allocator allocation in a specific heap, where `free` does nothing and +// the heap is destroyed in one go on destruction -- use with care! +template struct mi_heap_destroy_stl_allocator : public _mi_heap_stl_allocator_common { + using typename _mi_heap_stl_allocator_common::size_type; + mi_heap_destroy_stl_allocator() : _mi_heap_stl_allocator_common() { } // creates fresh heap that is destroyed when the destructor is called + mi_heap_destroy_stl_allocator(mi_heap_t* hp) : _mi_heap_stl_allocator_common(hp) { } // no delete nor destroy on the passed in heap + template mi_heap_destroy_stl_allocator(const mi_heap_destroy_stl_allocator& x) mi_attr_noexcept : _mi_heap_stl_allocator_common(x) { } + + mi_heap_destroy_stl_allocator select_on_container_copy_construction() const { return *this; } + void deallocate(T*, size_type) { /* do nothing as we destroy the heap on destruct. 
*/ } + template struct rebind { typedef mi_heap_destroy_stl_allocator other; }; +}; + +template bool operator==(const mi_heap_destroy_stl_allocator& x, const mi_heap_destroy_stl_allocator& y) mi_attr_noexcept { return (x.is_equal(y)); } +template bool operator!=(const mi_heap_destroy_stl_allocator& x, const mi_heap_destroy_stl_allocator& y) mi_attr_noexcept { return (!x.is_equal(y)); } + +#endif // C++11 + #endif // __cplusplus #endif diff --git a/contrib/libs/mimalloc/include/mimalloc-atomic.h b/contrib/libs/mimalloc/include/mimalloc/atomic.h similarity index 83% rename from contrib/libs/mimalloc/include/mimalloc-atomic.h rename to contrib/libs/mimalloc/include/mimalloc/atomic.h index f7cac357e28b..38f174e45466 100644 --- a/contrib/libs/mimalloc/include/mimalloc-atomic.h +++ b/contrib/libs/mimalloc/include/mimalloc/atomic.h @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2021 Microsoft Research, Daan Leijen +Copyright (c) 2018-2023 Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -11,9 +11,9 @@ terms of the MIT license. A copy of the license can be found in the file // -------------------------------------------------------------------------------------------- // Atomics // We need to be portable between C, C++, and MSVC. -// We base the primitives on the C/C++ atomics and create a mimimal wrapper for MSVC in C compilation mode. -// This is why we try to use only `uintptr_t` and `*` as atomic types. -// To gain better insight in the range of used atomics, we use explicitly named memory order operations +// We base the primitives on the C/C++ atomics and create a mimimal wrapper for MSVC in C compilation mode. +// This is why we try to use only `uintptr_t` and `*` as atomic types. +// To gain better insight in the range of used atomics, we use explicitly named memory order operations // instead of passing the memory order as a parameter. // ----------------------------------------------------------------------------------------------- @@ -23,10 +23,17 @@ terms of the MIT license. A copy of the license can be found in the file #define _Atomic(tp) std::atomic #define mi_atomic(name) std::atomic_##name #define mi_memory_order(name) std::memory_order_##name +#if (__cplusplus >= 202002L) // c++20, see issue #571 +#define MI_ATOMIC_VAR_INIT(x) x +#elif !defined(ATOMIC_VAR_INIT) +#define MI_ATOMIC_VAR_INIT(x) x +#else + #define MI_ATOMIC_VAR_INIT(x) ATOMIC_VAR_INIT(x) +#endif #elif defined(_MSC_VER) // Use MSVC C wrapper for C11 atomics #define _Atomic(tp) tp -#define ATOMIC_VAR_INIT(x) x +#define MI_ATOMIC_VAR_INIT(x) x #define mi_atomic(name) mi_atomic_##name #define mi_memory_order(name) mi_memory_order_##name #else @@ -34,6 +41,13 @@ terms of the MIT license. 
A copy of the license can be found in the file #include #define mi_atomic(name) atomic_##name #define mi_memory_order(name) memory_order_##name +#if (__STDC_VERSION__ >= 201710L) // c17, see issue #735 + #define MI_ATOMIC_VAR_INIT(x) x +#elif !defined(ATOMIC_VAR_INIT) + #define MI_ATOMIC_VAR_INIT(x) x +#else + #define MI_ATOMIC_VAR_INIT(x) ATOMIC_VAR_INIT(x) +#endif #endif // Various defines for all used memory orders in mimalloc @@ -107,17 +121,21 @@ static inline void mi_atomic_maxi64_relaxed(volatile int64_t* p, int64_t x) { } // Used by timers -#define mi_atomic_loadi64_acquire(p) mi_atomic(load_explicit)(p,mi_memory_order(acquire)) -#define mi_atomic_loadi64_relaxed(p) mi_atomic(load_explicit)(p,mi_memory_order(relaxed)) -#define mi_atomic_storei64_release(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(release)) -#define mi_atomic_storei64_relaxed(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(relaxed)) +#define mi_atomic_loadi64_acquire(p) mi_atomic(load_explicit)(p,mi_memory_order(acquire)) +#define mi_atomic_loadi64_relaxed(p) mi_atomic(load_explicit)(p,mi_memory_order(relaxed)) +#define mi_atomic_storei64_release(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(release)) +#define mi_atomic_storei64_relaxed(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(relaxed)) +#define mi_atomic_casi64_strong_acq_rel(p,e,d) mi_atomic_cas_strong_acq_rel(p,e,d) +#define mi_atomic_addi64_acq_rel(p,i) mi_atomic_add_acq_rel(p,i) #elif defined(_MSC_VER) -// MSVC C compilation wrapper that uses Interlocked operations to model C11 atomics. +// Legacy MSVC plain C compilation wrapper that uses Interlocked operations to model C11 atomics. +#ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN +#endif #include #include #ifdef _WIN64 @@ -173,7 +191,7 @@ static inline uintptr_t mi_atomic_exchange_explicit(_Atomic(uintptr_t)*p, uintpt } static inline void mi_atomic_thread_fence(mi_memory_order mo) { (void)(mo); - _Atomic(uintptr_t)x = 0; + _Atomic(uintptr_t) x = 0; mi_atomic_exchange_explicit(&x, 1, mo); } static inline uintptr_t mi_atomic_load_explicit(_Atomic(uintptr_t) const* p, mi_memory_order mo) { @@ -183,7 +201,7 @@ static inline uintptr_t mi_atomic_load_explicit(_Atomic(uintptr_t) const* p, mi_ #else uintptr_t x = *p; if (mo > mi_memory_order_relaxed) { - while (!mi_atomic_compare_exchange_weak_explicit(p, &x, x, mo, mi_memory_order_relaxed)) { /* nothing */ }; + while (!mi_atomic_compare_exchange_weak_explicit((_Atomic(uintptr_t)*)p, &x, x, mo, mi_memory_order_relaxed)) { /* nothing */ }; } return x; #endif @@ -239,6 +257,21 @@ static inline void mi_atomic_maxi64_relaxed(volatile _Atomic(int64_t)*p, int64_t } while (current < x && _InterlockedCompareExchange64(p, x, current) != current); } +static inline void mi_atomic_addi64_acq_rel(volatile _Atomic(int64_t*)p, int64_t i) { + mi_atomic_addi64_relaxed(p, i); +} + +static inline bool mi_atomic_casi64_strong_acq_rel(volatile _Atomic(int64_t*)p, int64_t* exp, int64_t des) { + int64_t read = _InterlockedCompareExchange64(p, des, *exp); + if (read == *exp) { + return true; + } + else { + *exp = read; + return false; + } +} + // The pointer macros cast to `uintptr_t`. 
#define mi_atomic_load_ptr_acquire(tp,p) (tp*)mi_atomic_load_acquire((_Atomic(uintptr_t)*)(p)) #define mi_atomic_load_ptr_relaxed(tp,p) (tp*)mi_atomic_load_relaxed((_Atomic(uintptr_t)*)(p)) @@ -269,14 +302,36 @@ static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub) { return (intptr_t)mi_atomic_addi(p, -sub); } -// Yield +typedef _Atomic(uintptr_t) mi_atomic_once_t; + +// Returns true only on the first invocation +static inline bool mi_atomic_once( mi_atomic_once_t* once ) { + if (mi_atomic_load_relaxed(once) != 0) return false; // quick test + uintptr_t expected = 0; + return mi_atomic_cas_strong_acq_rel(once, &expected, (uintptr_t)1); // try to set to 1 +} + +typedef _Atomic(uintptr_t) mi_atomic_guard_t; + +// Allows only one thread to execute at a time +#define mi_atomic_guard(guard) \ + uintptr_t _mi_guard_expected = 0; \ + for(bool _mi_guard_once = true; \ + _mi_guard_once && mi_atomic_cas_strong_acq_rel(guard,&_mi_guard_expected,(uintptr_t)1); \ + (mi_atomic_store_release(guard,(uintptr_t)0), _mi_guard_once = false) ) + + + +// Yield #if defined(__cplusplus) #include static inline void mi_atomic_yield(void) { std::this_thread::yield(); } #elif defined(_WIN32) +#ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN +#endif #include static inline void mi_atomic_yield(void) { YieldProcessor(); @@ -288,7 +343,7 @@ static inline void mi_atomic_yield(void) { } #elif (defined(__GNUC__) || defined(__clang__)) && \ (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__armel__) || defined(__ARMEL__) || \ - defined(__aarch64__) || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)) + defined(__aarch64__) || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)) || defined(__POWERPC__) #if defined(__x86_64__) || defined(__i386__) static inline void mi_atomic_yield(void) { __asm__ volatile ("pause" ::: "memory"); @@ -301,10 +356,16 @@ static inline void mi_atomic_yield(void) { static inline void mi_atomic_yield(void) { __asm__ volatile("yield" ::: "memory"); } -#elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) +#elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__POWERPC__) +#ifdef __APPLE__ +static inline void mi_atomic_yield(void) { + __asm__ volatile ("or r27,r27,r27" ::: "memory"); +} +#else static inline void mi_atomic_yield(void) { __asm__ __volatile__ ("or 27,27,27" ::: "memory"); } +#endif #elif defined(__armel__) || defined(__ARMEL__) static inline void mi_atomic_yield(void) { __asm__ volatile ("nop" ::: "memory"); diff --git a/contrib/libs/mimalloc/include/mimalloc-internal.h b/contrib/libs/mimalloc/include/mimalloc/internal.h similarity index 64% rename from contrib/libs/mimalloc/include/mimalloc-internal.h rename to contrib/libs/mimalloc/include/mimalloc/internal.h index 1e1a79665c57..2954eabd86ca 100644 --- a/contrib/libs/mimalloc/include/mimalloc-internal.h +++ b/contrib/libs/mimalloc/include/mimalloc/internal.h @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2021, Microsoft Research, Daan Leijen +Copyright (c) 2018-2023, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -8,7 +8,14 @@ terms of the MIT license. 
A copy of the license can be found in the file #ifndef MIMALLOC_INTERNAL_H #define MIMALLOC_INTERNAL_H -#include "mimalloc-types.h" + +// -------------------------------------------------------------------------- +// This file contains the interal API's of mimalloc and various utility +// functions and macros. +// -------------------------------------------------------------------------- + +#include "types.h" +#include "track.h" #if (MI_DEBUG>0) #define mi_trace_message(...) _mi_trace_message(__VA_ARGS__) @@ -19,17 +26,37 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_CACHE_LINE 64 #if defined(_MSC_VER) #pragma warning(disable:4127) // suppress constant conditional warning (due to MI_SECURE paths) +#pragma warning(disable:26812) // unscoped enum warning #define mi_decl_noinline __declspec(noinline) #define mi_decl_thread __declspec(thread) #define mi_decl_cache_align __declspec(align(MI_CACHE_LINE)) -#elif (defined(__GNUC__) && (__GNUC__>=3)) // includes clang and icc +#define mi_decl_weak +#elif (defined(__GNUC__) && (__GNUC__ >= 3)) || defined(__clang__) // includes clang and icc #define mi_decl_noinline __attribute__((noinline)) #define mi_decl_thread __thread #define mi_decl_cache_align __attribute__((aligned(MI_CACHE_LINE))) +#define mi_decl_weak __attribute__((weak)) #else #define mi_decl_noinline #define mi_decl_thread __thread // hope for the best :-) #define mi_decl_cache_align +#define mi_decl_weak +#endif + +#if defined(__EMSCRIPTEN__) && !defined(__wasi__) +#define __wasi__ +#endif + +#if defined(__cplusplus) +#define mi_decl_externc extern "C" +#else +#define mi_decl_externc +#endif + +// pthreads +#if !defined(_WIN32) && !defined(__wasi__) +#define MI_USE_PTHREADS +#include #endif // "options.c" @@ -43,61 +70,110 @@ void _mi_error_message(int err, const char* fmt, ...); // random.c void _mi_random_init(mi_random_ctx_t* ctx); +void _mi_random_init_weak(mi_random_ctx_t* ctx); +void _mi_random_reinit_if_weak(mi_random_ctx_t * ctx); void _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* new_ctx); uintptr_t _mi_random_next(mi_random_ctx_t* ctx); uintptr_t _mi_heap_random_next(mi_heap_t* heap); -uintptr_t _os_random_weak(uintptr_t extra_seed); +uintptr_t _mi_os_random_weak(uintptr_t extra_seed); static inline uintptr_t _mi_random_shuffle(uintptr_t x); // init.c extern mi_decl_cache_align mi_stats_t _mi_stats_main; extern mi_decl_cache_align const mi_page_t _mi_page_empty; bool _mi_is_main_thread(void); -bool _mi_preloading(); // true while the C runtime is not ready +size_t _mi_current_thread_count(void); +bool _mi_preloading(void); // true while the C runtime is not initialized yet +mi_threadid_t _mi_thread_id(void) mi_attr_noexcept; +mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing heap +void _mi_thread_done(mi_heap_t* heap); +void _mi_thread_data_collect(void); +void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap); // os.c +void _mi_os_init(void); // called from process init +void* _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* stats); +void _mi_os_free(void* p, size_t size, mi_memid_t memid, mi_stats_t* stats); +void _mi_os_free_ex(void* p, size_t size, bool still_committed, mi_memid_t memid, mi_stats_t* stats); + size_t _mi_os_page_size(void); -void _mi_os_init(void); // called from process init -void* _mi_os_alloc(size_t size, mi_stats_t* stats); // to allocate thread local data -void _mi_os_free(void* p, size_t size, mi_stats_t* stats); // to free thread local data size_t 
_mi_os_good_alloc_size(size_t size); - -// memory.c -void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* id, mi_os_tld_t* tld); -void _mi_mem_free(void* p, size_t size, size_t id, bool fully_committed, bool any_reset, mi_os_tld_t* tld); - -bool _mi_mem_reset(void* p, size_t size, mi_os_tld_t* tld); -bool _mi_mem_unreset(void* p, size_t size, bool* is_zero, mi_os_tld_t* tld); -bool _mi_mem_commit(void* p, size_t size, bool* is_zero, mi_os_tld_t* tld); -bool _mi_mem_protect(void* addr, size_t size); -bool _mi_mem_unprotect(void* addr, size_t size); - -void _mi_mem_collect(mi_os_tld_t* tld); +bool _mi_os_has_overcommit(void); +bool _mi_os_has_virtual_reserve(void); + +bool _mi_os_reset(void* addr, size_t size, mi_stats_t* tld_stats); +bool _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats); +bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats); +bool _mi_os_protect(void* addr, size_t size); +bool _mi_os_unprotect(void* addr, size_t size); +bool _mi_os_purge(void* p, size_t size, mi_stats_t* stats); +bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, mi_stats_t* stats); + +void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* stats); +void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* tld_stats); + +void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size); +bool _mi_os_use_large_page(size_t size, size_t alignment); +size_t _mi_os_large_page_size(void); + +void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_secs, size_t* pages_reserved, size_t* psize, mi_memid_t* memid); + +// arena.c +mi_arena_id_t _mi_arena_id_none(void); +void _mi_arena_free(void* p, size_t size, size_t still_committed_size, mi_memid_t memid, mi_stats_t* stats); +void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld); +void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld); +bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id); +bool _mi_arena_contains(const void* p); +void _mi_arenas_collect(bool force_purge, mi_stats_t* stats); +void _mi_arena_unsafe_destroy_all(mi_stats_t* stats); + +bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment); +void _mi_arena_segment_mark_abandoned(mi_segment_t* segment); +size_t _mi_arena_segment_abandoned_count(void); + +typedef struct mi_arena_field_cursor_s { // abstract + mi_arena_id_t start; + int count; + size_t bitmap_idx; +} mi_arena_field_cursor_t; +void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_arena_field_cursor_t* current); +mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous); + +// "segment-map.c" +void _mi_segment_map_allocated_at(const mi_segment_t* segment); +void _mi_segment_map_freed_at(const mi_segment_t* segment); // "segment.c" -mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_wsize, mi_segments_tld_t* tld, mi_os_tld_t* os_tld); +mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment, mi_segments_tld_t* tld, mi_os_tld_t* os_tld); void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld); void 
_mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld); -uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t block_size, size_t* page_size, size_t* pre_size); // page start for any page +uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size); + +#if MI_HUGE_PAGE_ABANDON void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); +#else +void _mi_segment_huge_page_reset(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); +#endif -void _mi_segment_thread_collect(mi_segments_tld_t* tld); +void _mi_segments_collect(bool force, mi_segments_tld_t* tld); void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld); void _mi_abandoned_await_readers(void); - - +bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment); // "page.c" -void* _mi_malloc_generic(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc; +void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept mi_attr_malloc; -void _mi_page_retire(mi_page_t* page); // free the page if there are no other pages with many free blocks +void _mi_page_retire(mi_page_t* page) mi_attr_noexcept; // free the page if there are no other pages with many free blocks void _mi_page_unfull(mi_page_t* page); void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force); // free the page void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq); // abandon the page, to be picked up by another thread... -void _mi_heap_delayed_free(mi_heap_t* heap); +void _mi_heap_delayed_free_all(mi_heap_t* heap); +bool _mi_heap_delayed_free_partial(mi_heap_t* heap); void _mi_heap_collect_retired(mi_heap_t* heap, bool force); void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never); +bool _mi_page_try_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never); size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append); void _mi_deferred_free(mi_heap_t* heap, bool force); @@ -108,24 +184,43 @@ size_t _mi_bin_size(uint8_t bin); // for stats uint8_t _mi_bin(size_t size); // for stats // "heap.c" +void _mi_heap_init(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag); void _mi_heap_destroy_pages(mi_heap_t* heap); void _mi_heap_collect_abandon(mi_heap_t* heap); void _mi_heap_set_default_direct(mi_heap_t* heap); +bool _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid); +void _mi_heap_unsafe_destroy_all(void); +mi_heap_t* _mi_heap_by_tag(mi_heap_t* heap, uint8_t tag); // "stats.c" void _mi_stats_done(mi_stats_t* stats); - mi_msecs_t _mi_clock_now(void); mi_msecs_t _mi_clock_end(mi_msecs_t start); mi_msecs_t _mi_clock_start(void); // "alloc.c" -void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept; // called from `_mi_malloc_generic` -void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero); -void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero); -mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p); +void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept; // called from `_mi_malloc_generic` +void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept; // called from `_mi_heap_malloc_aligned` +void* _mi_page_malloc_zeroed(mi_heap_t* heap, mi_page_t* 
page, size_t size) mi_attr_noexcept; // called from `_mi_heap_malloc_aligned` +void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept; +void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept; // called from `_mi_heap_malloc_aligned` +void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept; +mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p); bool _mi_free_delayed_block(mi_block_t* block); -void _mi_block_zero_init(const mi_page_t* page, void* p, size_t size); +void _mi_free_generic(mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept; // for runtime integration +void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size); + +// "libc.c" +#include +void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args); +void _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...); +char _mi_toupper(char c); +int _mi_strnicmp(const char* s, const char* t, size_t n); +void _mi_strlcpy(char* dest, const char* src, size_t dest_size); +void _mi_strlcat(char* dest, const char* src, size_t dest_size); +size_t _mi_strlen(const char* s); +size_t _mi_strnlen(const char* s, size_t max_len); +bool _mi_getenv(const char* name, char* result, size_t result_size); #if MI_DEBUG>1 bool _mi_page_is_valid(mi_page_t* page); @@ -137,8 +232,11 @@ bool _mi_page_is_valid(mi_page_t* page); // ------------------------------------------------------ #if defined(__GNUC__) || defined(__clang__) -#define mi_unlikely(x) __builtin_expect((x),0) -#define mi_likely(x) __builtin_expect((x),1) +#define mi_unlikely(x) (__builtin_expect(!!(x),false)) +#define mi_likely(x) (__builtin_expect(!!(x),true)) +#elif (defined(__cplusplus) && (__cplusplus >= 202002L)) || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L) +#define mi_unlikely(x) (x) [[unlikely]] +#define mi_likely(x) (x) [[likely]] #else #define mi_unlikely(x) (x) #define mi_likely(x) (x) @@ -176,11 +274,11 @@ bool _mi_page_is_valid(mi_page_t* page); /* ----------------------------------------------------------- Inlined definitions ----------------------------------------------------------- */ -#define UNUSED(x) (void)(x) +#define MI_UNUSED(x) (void)(x) #if (MI_DEBUG>0) -#define UNUSED_RELEASE(x) +#define MI_UNUSED_RELEASE(x) #else -#define UNUSED_RELEASE(x) UNUSED(x) +#define MI_UNUSED_RELEASE(x) MI_UNUSED(x) #endif #define MI_INIT4(x) x(),x(),x(),x() @@ -192,11 +290,21 @@ bool _mi_page_is_valid(mi_page_t* page); #define MI_INIT256(x) MI_INIT128(x),MI_INIT128(x) +#include +// initialize a local variable to zero; use memset as compilers optimize constant sized memset's +#define _mi_memzero_var(x) memset(&x,0,sizeof(x)) + // Is `x` a power of two? (0 is considered a power of two) static inline bool _mi_is_power_of_two(uintptr_t x) { return ((x & (x - 1)) == 0); } +// Is a pointer aligned? +static inline bool _mi_is_aligned(void* p, size_t alignment) { + mi_assert_internal(alignment != 0); + return (((uintptr_t)p % alignment) == 0); +} + // Align upwards static inline uintptr_t _mi_align_up(uintptr_t sz, size_t alignment) { mi_assert_internal(alignment != 0); @@ -209,6 +317,12 @@ static inline uintptr_t _mi_align_up(uintptr_t sz, size_t alignment) { } } +// Align a pointer upwards +static inline void* mi_align_up_ptr(void* p, size_t alignment) { + return (void*)_mi_align_up((uintptr_t)p, alignment); +} + + // Divide upwards: `s <= _mi_divide_up(s,d)*d < s+d`. 
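/* --------------------------------------------------------------------------
   Illustrative sketch (not part of the patch): with the redefined hint macros
   above, call sites spell the branch as `if mi_unlikely(cond) { ... }`, with
   the parentheses supplied by the macro. The same spelling then works for both
   expansions: `if (__builtin_expect(!!(cond),false)) { ... }` on GCC/clang and
   `if (cond) [[unlikely]] { ... }` under C++20, where the old
   `if (mi_unlikely(cond))` form would not compile. The function below is a
   hypothetical example of the pattern.
-------------------------------------------------------------------------- */
static inline uintptr_t my_align_up_checked(uintptr_t sz, size_t alignment) {
  if mi_unlikely(alignment == 0) {      // rare misuse, hinted as unlikely
    return sz;
  }
  return _mi_align_up(sz, alignment);
}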
static inline uintptr_t _mi_divide_up(uintptr_t size, size_t divider) { mi_assert_internal(divider != 0); @@ -216,7 +330,7 @@ static inline uintptr_t _mi_divide_up(uintptr_t size, size_t divider) { } // Is memory zero initialized? -static inline bool mi_mem_is_zero(void* p, size_t size) { +static inline bool mi_mem_is_zero(const void* p, size_t size) { for (size_t i = 0; i < size; i++) { if (((uint8_t*)p)[i] != 0) return false; } @@ -230,32 +344,27 @@ static inline size_t _mi_wsize_from_size(size_t size) { return (size + sizeof(uintptr_t) - 1) / sizeof(uintptr_t); } -// Does malloc satisfy the alignment constraints already? -static inline bool mi_malloc_satisfies_alignment(size_t alignment, size_t size) { - return (alignment == sizeof(void*) || (alignment == MI_MAX_ALIGN_SIZE && size > (MI_MAX_ALIGN_SIZE/2))); -} - // Overflow detecting multiply -#if __has_builtin(__builtin_umul_overflow) || __GNUC__ >= 5 +#if __has_builtin(__builtin_umul_overflow) || (defined(__GNUC__) && (__GNUC__ >= 5)) #include // UINT_MAX, ULONG_MAX #if defined(_CLOCK_T) // for Illumos #undef _CLOCK_T #endif static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) { - #if (SIZE_MAX == UINT_MAX) - return __builtin_umul_overflow(count, size, total); - #elif (SIZE_MAX == ULONG_MAX) - return __builtin_umull_overflow(count, size, total); + #if (SIZE_MAX == ULONG_MAX) + return __builtin_umull_overflow(count, size, (unsigned long *)total); + #elif (SIZE_MAX == UINT_MAX) + return __builtin_umul_overflow(count, size, (unsigned int *)total); #else - return __builtin_umulll_overflow(count, size, total); + return __builtin_umulll_overflow(count, size, (unsigned long long *)total); #endif } #else /* __builtin_umul_overflow is unavailable */ static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) { - #define MI_MUL_NO_OVERFLOW ((size_t)1 << (4*sizeof(size_t))) // sqrt(SIZE_MAX) + #define MI_MUL_COULD_OVERFLOW ((size_t)1 << (4*sizeof(size_t))) // sqrt(SIZE_MAX) *total = count * size; - return ((size >= MI_MUL_NO_OVERFLOW || count >= MI_MUL_NO_OVERFLOW) - && size > 0 && (SIZE_MAX / size) < count); + // note: gcc/clang optimize this to directly check the overflow flag + return ((size >= MI_MUL_COULD_OVERFLOW || count >= MI_MUL_COULD_OVERFLOW) && size > 0 && (SIZE_MAX / size) < count); } #endif @@ -265,8 +374,10 @@ static inline bool mi_count_size_overflow(size_t count, size_t size, size_t* tot *total = size; return false; } - else if (mi_unlikely(mi_mul_overflow(count, size, total))) { + else if mi_unlikely(mi_mul_overflow(count, size, total)) { + #if MI_DEBUG > 0 _mi_error_message(EOVERFLOW, "allocation request is too large (%zu * %zu bytes)\n", count, size); + #endif *total = SIZE_MAX; return true; } @@ -274,85 +385,11 @@ static inline bool mi_count_size_overflow(size_t count, size_t size, size_t* tot } -/* ---------------------------------------------------------------------------------------- -The thread local default heap: `_mi_get_default_heap` returns the thread local heap. -On most platforms (Windows, Linux, FreeBSD, NetBSD, etc), this just returns a -__thread local variable (`_mi_heap_default`). With the initial-exec TLS model this ensures -that the storage will always be available (allocated on the thread stacks). -On some platforms though we cannot use that when overriding `malloc` since the underlying -TLS implementation (or the loader) will call itself `malloc` on a first access and recurse. 
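/* --------------------------------------------------------------------------
   Illustrative sketch (not part of the patch): how the overflow-detecting
   multiply above is consumed by a calloc-style path. The fallback variant only
   suspects an overflow when either operand reaches sqrt(SIZE_MAX)
   (MI_MUL_COULD_OVERFLOW) and then confirms it with a single division. The
   wrapper below is hypothetical; mi_malloc is the public allocation entry
   point and may not be visible at this exact point in the headers.
-------------------------------------------------------------------------- */
static inline void* my_calloc_like(size_t count, size_t size) {
  size_t total;
  if (mi_count_size_overflow(count, size, &total)) {
    return NULL;               // count*size does not fit in size_t
  }
  return mi_malloc(total);     // allocate the (non-overflowing) total
}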
-We try to circumvent this in an efficient way: -- macOSX : we use an unused TLS slot from the OS allocated slots (MI_TLS_SLOT). On OSX, the - loader itself calls `malloc` even before the modules are initialized. -- OpenBSD: we use an unused slot from the pthread block (MI_TLS_PTHREAD_SLOT_OFS). -- DragonFly: the uniqueid use is buggy but kept for reference. +/*---------------------------------------------------------------------------------------- + Heap functions ------------------------------------------------------------------------------------------- */ extern const mi_heap_t _mi_heap_empty; // read-only empty heap, initial value of the thread local default heap -extern bool _mi_process_is_initialized; -mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing heap - -#if defined(MI_MALLOC_OVERRIDE) -#if defined(__APPLE__) // macOS -#define MI_TLS_SLOT 89 // seems unused? -// other possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89) -// see -#elif defined(__OpenBSD__) -// use end bytes of a name; goes wrong if anyone uses names > 23 characters (ptrhread specifies 16) -// see -#define MI_TLS_PTHREAD_SLOT_OFS (6*sizeof(int) + 4*sizeof(void*) + 24) -#elif defined(__DragonFly__) -#warning "mimalloc is not working correctly on DragonFly yet." -//#define MI_TLS_PTHREAD_SLOT_OFS (4 + 1*sizeof(void*)) // offset `uniqueid` (also used by gdb?) -#endif -#endif - -#if defined(MI_TLS_SLOT) -static inline void* mi_tls_slot(size_t slot) mi_attr_noexcept; // forward declaration -#elif defined(MI_TLS_PTHREAD_SLOT_OFS) -#include -static inline mi_heap_t** mi_tls_pthread_heap_slot(void) { - pthread_t self = pthread_self(); - #if defined(__DragonFly__) - if (self==NULL) { - mi_heap_t* pheap_main = _mi_heap_main_get(); - return &pheap_main; - } - #endif - return (mi_heap_t**)((uint8_t*)self + MI_TLS_PTHREAD_SLOT_OFS); -} -#elif defined(MI_TLS_PTHREAD) -#include -extern pthread_key_t _mi_heap_default_key; -#endif - -// Default heap to allocate from (if not using TLS- or pthread slots). -// Do not use this directly but use through `mi_heap_get_default()` (or the unchecked `mi_get_default_heap`). -// This thread local variable is only used when neither MI_TLS_SLOT, MI_TLS_PTHREAD, or MI_TLS_PTHREAD_SLOT_OFS are defined. -// However, on the Apple M1 we do use the address of this variable as the unique thread-id (issue #356). -extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from - -static inline mi_heap_t* mi_get_default_heap(void) { -#if defined(MI_TLS_SLOT) - mi_heap_t* heap = (mi_heap_t*)mi_tls_slot(MI_TLS_SLOT); - return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap); -#elif defined(MI_TLS_PTHREAD_SLOT_OFS) - mi_heap_t* heap = *mi_tls_pthread_heap_slot(); - return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap); -#elif defined(MI_TLS_PTHREAD) - mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? _mi_heap_main_get() : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key)); - return (mi_unlikely(heap == NULL) ? 
(mi_heap_t*)&_mi_heap_empty : heap); -#else - #if defined(MI_TLS_RECURSE_GUARD) - if (mi_unlikely(!_mi_process_is_initialized)) return _mi_heap_main_get(); - #endif - return _mi_heap_default; -#endif -} - -static inline bool mi_heap_is_default(const mi_heap_t* heap) { - return (heap == mi_get_default_heap()); -} static inline bool mi_heap_is_backing(const mi_heap_t* heap) { return (heap->tld->heap_backing == heap); @@ -380,30 +417,34 @@ static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t si return heap->pages_free_direct[idx]; } -// Get the page belonging to a certain size class -static inline mi_page_t* _mi_get_free_small_page(size_t size) { - return _mi_heap_get_free_small_page(mi_get_default_heap(), size); -} - // Segment that contains the pointer +// Large aligned blocks may be aligned at N*MI_SEGMENT_SIZE (inside a huge segment > MI_SEGMENT_SIZE), +// and we need align "down" to the segment info which is `MI_SEGMENT_SIZE` bytes before it; +// therefore we align one byte before `p`. +// We check for NULL afterwards on 64-bit systems to improve codegen for `mi_free`. static inline mi_segment_t* _mi_ptr_segment(const void* p) { - // mi_assert_internal(p != NULL); - return (mi_segment_t*)((uintptr_t)p & ~MI_SEGMENT_MASK); + mi_segment_t* const segment = (mi_segment_t*)(((uintptr_t)p - 1) & ~MI_SEGMENT_MASK); + #if MI_INTPTR_SIZE <= 4 + return (p==NULL ? NULL : segment); + #else + return ((intptr_t)segment <= 0 ? NULL : segment); + #endif } // Segment belonging to a page static inline mi_segment_t* _mi_page_segment(const mi_page_t* page) { + mi_assert_internal(page!=NULL); mi_segment_t* segment = _mi_ptr_segment(page); mi_assert_internal(segment == NULL || page == &segment->pages[page->segment_idx]); return segment; } // used internally -static inline uintptr_t _mi_segment_page_idx_of(const mi_segment_t* segment, const void* p) { +static inline size_t _mi_segment_page_idx_of(const mi_segment_t* segment, const void* p) { // if (segment->page_size > MI_SEGMENT_SIZE) return &segment->pages[0]; // huge pages ptrdiff_t diff = (uint8_t*)p - (uint8_t*)segment; - mi_assert_internal(diff >= 0 && (size_t)diff < MI_SEGMENT_SIZE); - uintptr_t idx = (uintptr_t)diff >> segment->page_shift; + mi_assert_internal(diff >= 0 && (size_t)diff <= MI_SEGMENT_SIZE /* for huge alignment it can be equal */); + size_t idx = (size_t)diff >> segment->page_shift; mi_assert_internal(idx < segment->capacity); mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM || idx == 0); return idx; @@ -411,34 +452,33 @@ static inline uintptr_t _mi_segment_page_idx_of(const mi_segment_t* segment, con // Get the page containing the pointer static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const void* p) { - uintptr_t idx = _mi_segment_page_idx_of(segment, p); + size_t idx = _mi_segment_page_idx_of(segment, p); return &((mi_segment_t*)segment)->pages[idx]; } // Quick page start for initialized pages -static inline uint8_t* _mi_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size) { - const size_t bsize = page->xblock_size; - mi_assert_internal(bsize > 0 && (bsize%sizeof(void*)) == 0); - return _mi_segment_page_start(segment, page, bsize, page_size, NULL); +static inline uint8_t* mi_page_start(const mi_page_t* page) { + mi_assert_internal(page->page_start != NULL); + mi_assert_expensive(_mi_segment_page_start(_mi_page_segment(page),page,NULL) == page->page_start); + return page->page_start; } // Get the page containing the pointer static inline mi_page_t* 
_mi_ptr_page(void* p) { + mi_assert_internal(p!=NULL); return _mi_segment_page_of(_mi_ptr_segment(p), p); } -// Get the block size of a page (special cased for huge objects) +// Get the block size of a page (special case for huge objects) static inline size_t mi_page_block_size(const mi_page_t* page) { - const size_t bsize = page->xblock_size; - mi_assert_internal(bsize > 0); - if (mi_likely(bsize < MI_HUGE_BLOCK_SIZE)) { - return bsize; - } - else { - size_t psize; - _mi_segment_page_start(_mi_page_segment(page), page, bsize, &psize, NULL); - return psize; - } + mi_assert_internal(page->block_size > 0); + return page->block_size; +} + +static inline bool mi_page_is_huge(const mi_page_t* page) { + mi_assert_internal((page->is_huge && _mi_page_segment(page)->page_kind == MI_PAGE_HUGE) || + (!page->is_huge && _mi_page_segment(page)->page_kind != MI_PAGE_HUGE)); + return page->is_huge; } // Get the usable block size of a page without fixed padding. @@ -447,6 +487,10 @@ static inline size_t mi_page_usable_block_size(const mi_page_t* page) { return mi_page_block_size(page) - MI_PADDING_SIZE; } +// size of a segment +static inline size_t mi_segment_size(mi_segment_t* segment) { + return segment->segment_size; +} // Thread free access static inline mi_block_t* mi_page_thread_free(const mi_page_t* page) { @@ -465,6 +509,7 @@ static inline mi_heap_t* mi_page_heap(const mi_page_t* page) { static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { mi_assert_internal(mi_page_thread_free_flag(page) != MI_DELAYED_FREEING); mi_atomic_store_release(&page->xheap,(uintptr_t)heap); + if (heap != NULL) { page->heap_tag = heap->tag; } } // Thread free flag helpers @@ -569,8 +614,8 @@ static inline bool mi_is_in_same_page(const void* p, const void* q) { mi_segment_t* segmentp = _mi_ptr_segment(p); mi_segment_t* segmentq = _mi_ptr_segment(q); if (segmentp != segmentq) return false; - uintptr_t idxp = _mi_segment_page_idx_of(segmentp, p); - uintptr_t idxq = _mi_segment_page_idx_of(segmentq, q); + size_t idxp = _mi_segment_page_idx_of(segmentp, p); + size_t idxq = _mi_segment_page_idx_of(segmentq, q); return (idxp == idxq); } @@ -585,30 +630,36 @@ static inline uintptr_t mi_rotr(uintptr_t x, uintptr_t shift) { static inline void* mi_ptr_decode(const void* null, const mi_encoded_t x, const uintptr_t* keys) { void* p = (void*)(mi_rotr(x - keys[0], keys[0]) ^ keys[1]); - return (mi_unlikely(p==null) ? NULL : p); + return (p==null ? NULL : p); } static inline mi_encoded_t mi_ptr_encode(const void* null, const void* p, const uintptr_t* keys) { - uintptr_t x = (uintptr_t)(mi_unlikely(p==NULL) ? null : p); + uintptr_t x = (uintptr_t)(p==NULL ? 
null : p); return mi_rotl(x ^ keys[1], keys[0]) + keys[0]; } static inline mi_block_t* mi_block_nextx( const void* null, const mi_block_t* block, const uintptr_t* keys ) { + mi_track_mem_defined(block,sizeof(mi_block_t)); + mi_block_t* next; #ifdef MI_ENCODE_FREELIST - return (mi_block_t*)mi_ptr_decode(null, block->next, keys); + next = (mi_block_t*)mi_ptr_decode(null, block->next, keys); #else - UNUSED(keys); UNUSED(null); - return (mi_block_t*)block->next; + MI_UNUSED(keys); MI_UNUSED(null); + next = (mi_block_t*)block->next; #endif + mi_track_mem_noaccess(block,sizeof(mi_block_t)); + return next; } static inline void mi_block_set_nextx(const void* null, mi_block_t* block, const mi_block_t* next, const uintptr_t* keys) { + mi_track_mem_undefined(block,sizeof(mi_block_t)); #ifdef MI_ENCODE_FREELIST block->next = mi_ptr_encode(null, next, keys); #else - UNUSED(keys); UNUSED(null); + MI_UNUSED(keys); MI_UNUSED(null); block->next = (mi_encoded_t)next; #endif + mi_track_mem_noaccess(block,sizeof(mi_block_t)); } static inline mi_block_t* mi_block_next(const mi_page_t* page, const mi_block_t* block) { @@ -616,13 +667,13 @@ static inline mi_block_t* mi_block_next(const mi_page_t* page, const mi_block_t* mi_block_t* next = mi_block_nextx(page,block,page->keys); // check for free list corruption: is `next` at least in the same page? // TODO: check if `next` is `page->block_size` aligned? - if (mi_unlikely(next!=NULL && !mi_is_in_same_page(block, next))) { + if mi_unlikely(next!=NULL && !mi_is_in_same_page(block, next)) { _mi_error_message(EFAULT, "corrupted free list entry of size %zub at %p: value 0x%zx\n", mi_page_block_size(page), block, (uintptr_t)next); next = NULL; } return next; #else - UNUSED(page); + MI_UNUSED(page); return mi_block_nextx(page,block,NULL); #endif } @@ -631,11 +682,36 @@ static inline void mi_block_set_next(const mi_page_t* page, mi_block_t* block, c #ifdef MI_ENCODE_FREELIST mi_block_set_nextx(page,block,next, page->keys); #else - UNUSED(page); + MI_UNUSED(page); mi_block_set_nextx(page,block,next,NULL); #endif } + +/* ----------------------------------------------------------- + memory id's +----------------------------------------------------------- */ + +static inline mi_memid_t _mi_memid_create(mi_memkind_t memkind) { + mi_memid_t memid; + _mi_memzero_var(memid); + memid.memkind = memkind; + return memid; +} + +static inline mi_memid_t _mi_memid_none(void) { + return _mi_memid_create(MI_MEM_NONE); +} + +static inline mi_memid_t _mi_memid_create_os(bool committed, bool is_zero, bool is_large) { + mi_memid_t memid = _mi_memid_create(MI_MEM_OS); + memid.initially_committed = committed; + memid.initially_zero = is_zero; + memid.is_pinned = is_large; + return memid; +} + + // ------------------------------------------------------------------- // Fast "random" shuffle // ------------------------------------------------------------------- @@ -669,102 +745,16 @@ size_t _mi_os_numa_node_count_get(void); extern _Atomic(size_t) _mi_numa_node_count; static inline int _mi_os_numa_node(mi_os_tld_t* tld) { - if (mi_likely(mi_atomic_load_relaxed(&_mi_numa_node_count) == 1)) return 0; + if mi_likely(mi_atomic_load_relaxed(&_mi_numa_node_count) == 1) { return 0; } else return _mi_os_numa_node_get(tld); } static inline size_t _mi_os_numa_node_count(void) { const size_t count = mi_atomic_load_relaxed(&_mi_numa_node_count); - if (mi_likely(count>0)) return count; + if mi_likely(count > 0) { return count; } else return _mi_os_numa_node_count_get(); } -// 
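/* --------------------------------------------------------------------------
   Illustrative sketch (not part of the patch): constructing the memory id's
   introduced above. A memid records where a memory range came from and in
   what state it was handed out, so the matching free path knows what to do.
   The example values and function names are hypothetical.
-------------------------------------------------------------------------- */
static mi_memid_t my_fresh_os_memid(void) {
  // memory obtained directly from the OS: committed, zeroed, not pinned
  return _mi_memid_create_os(/*committed*/ true, /*is_zero*/ true, /*is_large*/ false);
}

static mi_memid_t my_no_memory_memid(void) {
  return _mi_memid_none();     // placeholder id for "no memory attached"
}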
------------------------------------------------------------------- -// Getting the thread id should be performant as it is called in the -// fast path of `_mi_free` and we specialize for various platforms. -// ------------------------------------------------------------------- -#if defined(_WIN32) -#define WIN32_LEAN_AND_MEAN -#include -static inline uintptr_t _mi_thread_id(void) mi_attr_noexcept { - // Windows: works on Intel and ARM in both 32- and 64-bit - return (uintptr_t)NtCurrentTeb(); -} - -#elif defined(__GNUC__) && \ - (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__)) - -// TLS register on x86 is in the FS or GS register, see: https://akkadia.org/drepper/tls.pdf -static inline void* mi_tls_slot(size_t slot) mi_attr_noexcept { - void* res; - const size_t ofs = (slot*sizeof(void*)); -#if defined(__i386__) - __asm__("movl %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // 32-bit always uses GS -#elif defined(__APPLE__) && defined(__x86_64__) - __asm__("movq %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86_64 macOSX uses GS -#elif defined(__x86_64__) && (MI_INTPTR_SIZE==4) - __asm__("movl %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x32 ABI -#elif defined(__x86_64__) - __asm__("movq %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86_64 Linux, BSD uses FS -#elif defined(__arm__) - void** tcb; UNUSED(ofs); - __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb)); - res = tcb[slot]; -#elif defined(__aarch64__) - void** tcb; UNUSED(ofs); -#if defined(__APPLE__) // M1, issue #343 - __asm__ volatile ("mrs %0, tpidrro_el0" : "=r" (tcb)); - tcb = (void**)((uintptr_t)tcb & ~0x07UL); // clear lower 3 bits -#else - __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb)); -#endif - res = tcb[slot]; -#endif - return res; -} - -// setting is only used on macOSX for now -static inline void mi_tls_slot_set(size_t slot, void* value) mi_attr_noexcept { - const size_t ofs = (slot*sizeof(void*)); -#if defined(__i386__) - __asm__("movl %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // 32-bit always uses GS -#elif defined(__APPLE__) && defined(__x86_64__) - __asm__("movq %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x86_64 macOSX uses GS -#elif defined(__x86_64__) && (MI_INTPTR_SIZE==4) - __asm__("movl %1,%%fs:%1" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x32 ABI -#elif defined(__x86_64__) - __asm__("movq %1,%%fs:%1" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x86_64 Linux, BSD uses FS -#elif defined(__arm__) - void** tcb; UNUSED(ofs); - __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb)); - tcb[slot] = value; -#elif defined(__aarch64__) - void** tcb; UNUSED(ofs); -#if defined(__APPLE__) // M1, issue #343 - __asm__ volatile ("mrs %0, tpidrro_el0" : "=r" (tcb)); - tcb = (void**)((uintptr_t)tcb & ~0x07UL); // clear lower 3 bits -#else - __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb)); -#endif - tcb[slot] = value; -#endif -} - -static inline uintptr_t _mi_thread_id(void) mi_attr_noexcept { -#if defined(__BIONIC__) && (defined(__arm__) || defined(__aarch64__)) - // on Android, slot 1 is the thread ID (pointer to pthread internal struct) - return (uintptr_t)mi_tls_slot(1); -#else - // in all our other targets, slot 0 is the pointer to the thread control block - return (uintptr_t)mi_tls_slot(0); -#endif -} -#else -// otherwise use standard C -static inline uintptr_t _mi_thread_id(void) mi_attr_noexcept { - return (uintptr_t)&_mi_heap_default; -} -#endif // 
----------------------------------------------------------------------- // Count bits: trailing or leading zeros (with MI_INTPTR_BITS on all zero) @@ -791,9 +781,10 @@ static inline size_t mi_ctz(uintptr_t x) { #endif } -#elif defined(_MSC_VER) +#elif defined(_MSC_VER) #include // LONG_MAX +#include // BitScanReverse64 #define MI_HAVE_FAST_BITSCAN static inline size_t mi_clz(uintptr_t x) { if (x==0) return MI_INTPTR_BITS; @@ -802,7 +793,7 @@ static inline size_t mi_clz(uintptr_t x) { _BitScanReverse(&idx, x); #else _BitScanReverse64(&idx, x); -#endif +#endif return ((MI_INTPTR_BITS - 1) - idx); } static inline size_t mi_ctz(uintptr_t x) { @@ -812,7 +803,7 @@ static inline size_t mi_ctz(uintptr_t x) { _BitScanForward(&idx, x); #else _BitScanForward64(&idx, x); -#endif +#endif return idx; } @@ -842,7 +833,7 @@ static inline size_t mi_clz32(uint32_t x) { } static inline size_t mi_clz(uintptr_t x) { - if (x==0) return MI_INTPTR_BITS; + if (x==0) return MI_INTPTR_BITS; #if (MI_INTPTR_BITS <= 32) return mi_clz32((uint32_t)x); #else @@ -873,44 +864,57 @@ static inline size_t mi_bsr(uintptr_t x) { // --------------------------------------------------------------------------------- // Provide our own `_mi_memcpy` for potential performance optimizations. // -// For now, only on Windows with msvc/clang-cl we optimize to `rep movsb` if -// we happen to run on x86/x64 cpu's that have "fast short rep movsb" (FSRM) support -// (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017). See also issue #201 and pr #253. +// For now, only on Windows with msvc/clang-cl we optimize to `rep movsb` if +// we happen to run on x86/x64 cpu's that have "fast short rep movsb" (FSRM) support +// (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017). See also issue #201 and pr #253. // --------------------------------------------------------------------------------- -#if defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64)) +#if !MI_TRACK_ENABLED && defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64)) #include -#include extern bool _mi_cpu_has_fsrm; static inline void _mi_memcpy(void* dst, const void* src, size_t n) { if (_mi_cpu_has_fsrm) { __movsb((unsigned char*)dst, (const unsigned char*)src, n); } else { - memcpy(dst, src, n); // todo: use noinline? + memcpy(dst, src, n); + } +} +static inline void _mi_memzero(void* dst, size_t n) { + if (_mi_cpu_has_fsrm) { + __stosb((unsigned char*)dst, 0, n); + } + else { + memset(dst, 0, n); } } #else -#include static inline void _mi_memcpy(void* dst, const void* src, size_t n) { memcpy(dst, src, n); } +static inline void _mi_memzero(void* dst, size_t n) { + memset(dst, 0, n); +} #endif - // ------------------------------------------------------------------------------- -// The `_mi_memcpy_aligned` can be used if the pointers are machine-word aligned +// The `_mi_memcpy_aligned` can be used if the pointers are machine-word aligned // This is used for example in `mi_realloc`. // ------------------------------------------------------------------------------- -#if (__GNUC__ >= 4) || defined(__clang__) +#if (defined(__GNUC__) && (__GNUC__ >= 4)) || defined(__clang__) // On GCC/CLang we provide a hint that the pointers are word aligned. 
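/* --------------------------------------------------------------------------
   Illustrative sketch (not part of the patch): expected results of the bit
   scan helpers above, assuming a 64-bit build (MI_INTPTR_BITS == 64). Unlike
   the raw compiler builtins, a zero argument is well defined here and yields
   MI_INTPTR_BITS.
-------------------------------------------------------------------------- */
static inline void my_bitscan_examples(void) {
  size_t a = mi_ctz(0x8);            // 3  : three trailing zero bits
  size_t b = mi_clz(0x8);            // 60 : 64 bits minus the top 4 used bits
  size_t c = mi_bsr(0x8);            // 3  : index of the highest set bit
  size_t z = mi_ctz(0);              // 64 : MI_INTPTR_BITS by definition
  MI_UNUSED(a); MI_UNUSED(b); MI_UNUSED(c); MI_UNUSED(z);
}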
-#include static inline void _mi_memcpy_aligned(void* dst, const void* src, size_t n) { mi_assert_internal(((uintptr_t)dst % MI_INTPTR_SIZE == 0) && ((uintptr_t)src % MI_INTPTR_SIZE == 0)); void* adst = __builtin_assume_aligned(dst, MI_INTPTR_SIZE); const void* asrc = __builtin_assume_aligned(src, MI_INTPTR_SIZE); - memcpy(adst, asrc, n); + _mi_memcpy(adst, asrc, n); +} + +static inline void _mi_memzero_aligned(void* dst, size_t n) { + mi_assert_internal((uintptr_t)dst % MI_INTPTR_SIZE == 0); + void* adst = __builtin_assume_aligned(dst, MI_INTPTR_SIZE); + _mi_memzero(adst, n); } #else // Default fallback on `_mi_memcpy` @@ -918,6 +922,11 @@ static inline void _mi_memcpy_aligned(void* dst, const void* src, size_t n) { mi_assert_internal(((uintptr_t)dst % MI_INTPTR_SIZE == 0) && ((uintptr_t)src % MI_INTPTR_SIZE == 0)); _mi_memcpy(dst, src, n); } + +static inline void _mi_memzero_aligned(void* dst, size_t n) { + mi_assert_internal((uintptr_t)dst % MI_INTPTR_SIZE == 0); + _mi_memzero(dst, n); +} #endif diff --git a/contrib/libs/mimalloc/include/mimalloc/prim.h b/contrib/libs/mimalloc/include/mimalloc/prim.h new file mode 100644 index 000000000000..3f4574ddd927 --- /dev/null +++ b/contrib/libs/mimalloc/include/mimalloc/prim.h @@ -0,0 +1,373 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018-2023, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ +#pragma once +#ifndef MIMALLOC_PRIM_H +#define MIMALLOC_PRIM_H + + +// -------------------------------------------------------------------------- +// This file specifies the primitive portability API. +// Each OS/host needs to implement these primitives, see `src/prim` +// for implementations on Window, macOS, WASI, and Linux/Unix. +// +// note: on all primitive functions, we always have result parameters != NULL, and: +// addr != NULL and page aligned +// size > 0 and page aligned +// the return value is an error code as an `int` where 0 is success +// -------------------------------------------------------------------------- + +// OS memory configuration +typedef struct mi_os_mem_config_s { + size_t page_size; // default to 4KiB + size_t large_page_size; // 0 if not supported, usually 2MiB (4MiB on Windows) + size_t alloc_granularity; // smallest allocation size (usually 4KiB, on Windows 64KiB) + bool has_overcommit; // can we reserve more memory than can be actually committed? + bool has_partial_free; // can allocated blocks be freed partially? (true for mmap, false for VirtualAlloc) + bool has_virtual_reserve; // supports virtual address space reservation? (if true we can reserve virtual address space without using commit or physical memory) +} mi_os_mem_config_t; + +// Initialize +void _mi_prim_mem_init( mi_os_mem_config_t* config ); + +// Free OS memory +int _mi_prim_free(void* addr, size_t size ); + +// Allocate OS memory. Return NULL on error. +// The `try_alignment` is just a hint and the returned pointer does not have to be aligned. +// If `commit` is false, the virtual memory range only needs to be reserved (with no access) +// which will later be committed explicitly using `_mi_prim_commit`. 
+// `is_zero` is set to true if the memory was zero initialized (as on most OS's) +// pre: !commit => !allow_large +// try_alignment >= _mi_os_page_size() and a power of 2 +int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr); + +// Commit memory. Returns error code or 0 on success. +// For example, on Linux this would make the memory PROT_READ|PROT_WRITE. +// `is_zero` is set to true if the memory was zero initialized (e.g. on Windows) +int _mi_prim_commit(void* addr, size_t size, bool* is_zero); + +// Decommit memory. Returns error code or 0 on success. The `needs_recommit` result is true +// if the memory would need to be re-committed. For example, on Windows this is always true, +// but on Linux we could use MADV_DONTNEED to decommit which does not need a recommit. +// pre: needs_recommit != NULL +int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit); + +// Reset memory. The range keeps being accessible but the content might be reset. +// Returns error code or 0 on success. +int _mi_prim_reset(void* addr, size_t size); + +// Protect memory. Returns error code or 0 on success. +int _mi_prim_protect(void* addr, size_t size, bool protect); + +// Allocate huge (1GiB) pages possibly associated with a NUMA node. +// `is_zero` is set to true if the memory was zero initialized (as on most OS's) +// pre: size > 0 and a multiple of 1GiB. +// numa_node is either negative (don't care), or a numa node number. +int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr); + +// Return the current NUMA node +size_t _mi_prim_numa_node(void); + +// Return the number of logical NUMA nodes +size_t _mi_prim_numa_node_count(void); + +// Clock ticks +mi_msecs_t _mi_prim_clock_now(void); + +// Return process information (only for statistics) +typedef struct mi_process_info_s { + mi_msecs_t elapsed; + mi_msecs_t utime; + mi_msecs_t stime; + size_t current_rss; + size_t peak_rss; + size_t current_commit; + size_t peak_commit; + size_t page_faults; +} mi_process_info_t; + +void _mi_prim_process_info(mi_process_info_t* pinfo); + +// Default stderr output. (only for warnings etc. with verbose enabled) +// msg != NULL && _mi_strlen(msg) > 0 +void _mi_prim_out_stderr( const char* msg ); + +// Get an environment variable. (only for options) +// name != NULL, result != NULL, result_size >= 64 +bool _mi_prim_getenv(const char* name, char* result, size_t result_size); + + +// Fill a buffer with strong randomness; return `false` on error or if +// there is no strong randomization available. +bool _mi_prim_random_buf(void* buf, size_t buf_len); + +// Called on the first thread start, and should ensure `_mi_thread_done` is called on thread termination. +void _mi_prim_thread_init_auto_done(void); + +// Called on process exit and may take action to clean up resources associated with the thread auto done. +void _mi_prim_thread_done_auto_done(void); + +// Called when the default heap for a thread changes +void _mi_prim_thread_associate_default_heap(mi_heap_t* heap); + + +//------------------------------------------------------------------- +// Thread id: `_mi_prim_thread_id()` +// +// Getting the thread id should be performant as it is called in the +// fast path of `_mi_free` and we specialize for various platforms as +// inlined definitions. Regular code should call `init.c:_mi_thread_id()`. 
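/* --------------------------------------------------------------------------
   Illustrative sketch (not part of the patch): roughly what a Unix-style port
   of the commit primitive declared above might look like. This is a guess for
   illustration, not the code in src/prim; it only demonstrates the contract
   (return 0 on success or an errno value, and report via `is_zero` whether
   the committed range is known to be zero).
-------------------------------------------------------------------------- */
#include <stdbool.h>
#include <errno.h>
#include <sys/mman.h>

static int my_prim_commit_sketch(void* addr, size_t size, bool* is_zero) {
  *is_zero = false;   // conservative: existing contents are preserved, not zeroed
  int err = mprotect(addr, size, PROT_READ | PROT_WRITE);
  return (err == 0 ? 0 : errno);
}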
+// We only require _mi_prim_thread_id() to return a unique id +// for each thread (unequal to zero). +//------------------------------------------------------------------- + +// On some libc + platform combinations we can directly access a thread-local storage (TLS) slot. +// The TLS layout depends on both the OS and libc implementation so we use specific tests for each main platform. +// If you test on another platform and it works please send a PR :-) +// see also https://akkadia.org/drepper/tls.pdf for more info on the TLS register. +// +// Note: we would like to prefer `__builtin_thread_pointer()` nowadays instead of using assembly, +// but unfortunately we can not detect support reliably (see issue #883) +// We also use it on Apple OS as we use a TLS slot for the default heap there. +#if defined(__GNUC__) && ( \ + (defined(__GLIBC__) && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \ + || (defined(__APPLE__) && (defined(__x86_64__) || defined(__aarch64__) || defined(__POWERPC__))) \ + || (defined(__BIONIC__) && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \ + || (defined(__FreeBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \ + || (defined(__OpenBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \ + ) + +#define MI_HAS_TLS_SLOT + +static inline void* mi_prim_tls_slot(size_t slot) mi_attr_noexcept { + void* res; + const size_t ofs = (slot*sizeof(void*)); + #if defined(__i386__) + __asm__("movl %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86 32-bit always uses GS + #elif defined(__APPLE__) && defined(__x86_64__) + __asm__("movq %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86_64 macOSX uses GS + #elif defined(__x86_64__) && (MI_INTPTR_SIZE==4) + __asm__("movl %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x32 ABI + #elif defined(__x86_64__) + __asm__("movq %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86_64 Linux, BSD uses FS + #elif defined(__arm__) + void** tcb; MI_UNUSED(ofs); + __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb)); + res = tcb[slot]; + #elif defined(__aarch64__) + void** tcb; MI_UNUSED(ofs); + #if defined(__APPLE__) // M1, issue #343 + __asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb)); + #else + __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb)); + #endif + res = tcb[slot]; + #elif defined(__APPLE__) && defined(__POWERPC__) // ppc, issue #781 + MI_UNUSED(ofs); + res = pthread_getspecific(slot); + #endif + return res; +} + +// setting a tls slot is only used on macOS for now +static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexcept { + const size_t ofs = (slot*sizeof(void*)); + #if defined(__i386__) + __asm__("movl %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // 32-bit always uses GS + #elif defined(__APPLE__) && defined(__x86_64__) + __asm__("movq %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x86_64 macOS uses GS + #elif defined(__x86_64__) && (MI_INTPTR_SIZE==4) + __asm__("movl %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x32 ABI + #elif defined(__x86_64__) + __asm__("movq %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x86_64 Linux, BSD uses FS + #elif defined(__arm__) + void** tcb; MI_UNUSED(ofs); + __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb)); + tcb[slot] = value; + #elif defined(__aarch64__) + void** tcb; MI_UNUSED(ofs); + #if 
defined(__APPLE__) // M1, issue #343 + __asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb)); + #else + __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb)); + #endif + tcb[slot] = value; + #elif defined(__APPLE__) && defined(__POWERPC__) // ppc, issue #781 + MI_UNUSED(ofs); + pthread_setspecific(slot, value); + #endif +} + +#endif + +// Do we have __builtin_thread_pointer? This would be the preferred way to get a unique thread id +// but unfortunately, it seems we cannot test for this reliably at this time (see issue #883) +// Nevertheless, it seems needed on older graviton platforms (see issue #851). +// For now, we only enable this for specific platforms. +#if !defined(__APPLE__) /* on apple (M1) the wrong register is read (tpidr_el0 instead of tpidrro_el0) so fall back to TLS slot assembly ()*/ \ + && !defined(MI_LIBC_MUSL) \ + && (!defined(__clang_major__) || __clang_major__ >= 14) /* older clang versions emit bad code; fall back to using the TLS slot () */ + #if (defined(__GNUC__) && (__GNUC__ >= 7) && defined(__aarch64__)) /* aarch64 for older gcc versions (issue #851) */ \ + || (defined(__GNUC__) && (__GNUC__ >= 11) && defined(__x86_64__)) \ + || (defined(__clang_major__) && (__clang_major__ >= 14) && (defined(__aarch64__) || defined(__x86_64__))) + #define MI_USE_BUILTIN_THREAD_POINTER 1 + #endif +#endif + + + +// defined in `init.c`; do not use these directly +extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from +extern bool _mi_process_is_initialized; // has mi_process_init been called? + +static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept; + +// Get a unique id for the current thread. +#if defined(MI_PRIM_THREAD_ID) + +static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { + return MI_PRIM_THREAD_ID(); // used for example by CPython for a free threaded build (see python/cpython#115488) +} + +#elif defined(_WIN32) + +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif +#include +static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { + // Windows: works on Intel and ARM in both 32- and 64-bit + return (uintptr_t)NtCurrentTeb(); +} + +#elif MI_USE_BUILTIN_THREAD_POINTER + +static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { + // Works on most Unix based platforms with recent compilers + return (uintptr_t)__builtin_thread_pointer(); +} + +#elif defined(MI_HAS_TLS_SLOT) + +static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { + #if defined(__BIONIC__) + // issue #384, #495: on the Bionic libc (Android), slot 1 is the thread id + // see: https://github.com/aosp-mirror/platform_bionic/blob/c44b1d0676ded732df4b3b21c5f798eacae93228/libc/platform/bionic/tls_defines.h#L86 + return (uintptr_t)mi_prim_tls_slot(1); + #else + // in all our other targets, slot 0 is the thread id + // glibc: https://sourceware.org/git/?p=glibc.git;a=blob_plain;f=sysdeps/x86_64/nptl/tls.h + // apple: https://github.com/apple/darwin-xnu/blob/main/libsyscall/os/tsd.h#L36 + return (uintptr_t)mi_prim_tls_slot(0); + #endif +} + +#else + +// otherwise use portable C, taking the address of a thread local variable (this is still very fast on most platforms). 
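/* --------------------------------------------------------------------------
   Illustrative sketch (not part of the patch): an embedder can override the
   thread-id primitive by defining MI_PRIM_THREAD_ID at build time, e.g.
   -DMI_PRIM_THREAD_ID=my_runtime_thread_id. The only requirement stated above
   is a per-thread unique, nonzero value. The function below is hypothetical.
-------------------------------------------------------------------------- */
static inline uintptr_t my_runtime_thread_id(void) {
  static _Thread_local char my_tls_anchor;   // its address is unique (and nonzero) per thread
  return (uintptr_t)&my_tls_anchor;
}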
+static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { + return (uintptr_t)&_mi_heap_default; +} + +#endif + + + +/* ---------------------------------------------------------------------------------------- +The thread local default heap: `_mi_prim_get_default_heap()` +This is inlined here as it is on the fast path for allocation functions. + +On most platforms (Windows, Linux, FreeBSD, NetBSD, etc), this just returns a +__thread local variable (`_mi_heap_default`). With the initial-exec TLS model this ensures +that the storage will always be available (allocated on the thread stacks). + +On some platforms though we cannot use that when overriding `malloc` since the underlying +TLS implementation (or the loader) will call itself `malloc` on a first access and recurse. +We try to circumvent this in an efficient way: +- macOSX : we use an unused TLS slot from the OS allocated slots (MI_TLS_SLOT). On OSX, the + loader itself calls `malloc` even before the modules are initialized. +- OpenBSD: we use an unused slot from the pthread block (MI_TLS_PTHREAD_SLOT_OFS). +- DragonFly: defaults are working but seem slow compared to freeBSD (see PR #323) +------------------------------------------------------------------------------------------- */ + +static inline mi_heap_t* mi_prim_get_default_heap(void); + +#if defined(MI_MALLOC_OVERRIDE) +#if defined(__APPLE__) // macOS + #define MI_TLS_SLOT 89 // seems unused? + // other possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89) + // see +#elif defined(__OpenBSD__) + // use end bytes of a name; goes wrong if anyone uses names > 23 characters (ptrhread specifies 16) + // see + #define MI_TLS_PTHREAD_SLOT_OFS (6*sizeof(int) + 4*sizeof(void*) + 24) + // #elif defined(__DragonFly__) + // #warning "mimalloc is not working correctly on DragonFly yet." + // #define MI_TLS_PTHREAD_SLOT_OFS (4 + 1*sizeof(void*)) // offset `uniqueid` (also used by gdb?) +#elif defined(__ANDROID__) + // See issue #381 + #define MI_TLS_PTHREAD +#endif +#endif + + +#if defined(MI_TLS_SLOT) +# if !defined(MI_HAS_TLS_SLOT) +# error "trying to use a TLS slot for the default heap, but the mi_prim_tls_slot primitives are not defined" +# endif + +static inline mi_heap_t* mi_prim_get_default_heap(void) { + mi_heap_t* heap = (mi_heap_t*)mi_prim_tls_slot(MI_TLS_SLOT); + if mi_unlikely(heap == NULL) { + #ifdef __GNUC__ + __asm(""); // prevent conditional load of the address of _mi_heap_empty + #endif + heap = (mi_heap_t*)&_mi_heap_empty; + } + return heap; +} + +#elif defined(MI_TLS_PTHREAD_SLOT_OFS) + +static inline mi_heap_t** mi_prim_tls_pthread_heap_slot(void) { + pthread_t self = pthread_self(); + #if defined(__DragonFly__) + if (self==NULL) return NULL; + #endif + return (mi_heap_t**)((uint8_t*)self + MI_TLS_PTHREAD_SLOT_OFS); +} + +static inline mi_heap_t* mi_prim_get_default_heap(void) { + mi_heap_t** pheap = mi_prim_tls_pthread_heap_slot(); + if mi_unlikely(pheap == NULL) return _mi_heap_main_get(); + mi_heap_t* heap = *pheap; + if mi_unlikely(heap == NULL) return (mi_heap_t*)&_mi_heap_empty; + return heap; +} + +#elif defined(MI_TLS_PTHREAD) + +extern pthread_key_t _mi_heap_default_key; +static inline mi_heap_t* mi_prim_get_default_heap(void) { + mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? _mi_heap_main_get() : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key)); + return (mi_unlikely(heap == NULL) ? 
(mi_heap_t*)&_mi_heap_empty : heap); +} + +#else // default using a thread local variable; used on most platforms. + +static inline mi_heap_t* mi_prim_get_default_heap(void) { + #if defined(MI_TLS_RECURSE_GUARD) + if (mi_unlikely(!_mi_process_is_initialized)) return _mi_heap_main_get(); + #endif + return _mi_heap_default; +} + +#endif // mi_prim_get_default_heap() + + + +#endif // MIMALLOC_PRIM_H diff --git a/contrib/libs/mimalloc/include/mimalloc/track.h b/contrib/libs/mimalloc/include/mimalloc/track.h new file mode 100644 index 000000000000..42ca9071ccf5 --- /dev/null +++ b/contrib/libs/mimalloc/include/mimalloc/track.h @@ -0,0 +1,149 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018-2023, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ +#pragma once +#ifndef MIMALLOC_TRACK_H +#define MIMALLOC_TRACK_H + +/* ------------------------------------------------------------------------------------------------------ +Track memory ranges with macros for tools like Valgrind address sanitizer, or other memory checkers. +These can be defined for tracking allocation: + + #define mi_track_malloc_size(p,reqsize,size,zero) + #define mi_track_free_size(p,_size) + +The macros are set up such that the size passed to `mi_track_free_size` +always matches the size of `mi_track_malloc_size`. (currently, `size == mi_usable_size(p)`). +The `reqsize` is what the user requested, and `size >= reqsize`. +The `size` is either byte precise (and `size==reqsize`) if `MI_PADDING` is enabled, +or otherwise it is the usable block size which may be larger than the original request. +Use `_mi_block_size_of(void* p)` to get the full block size that was allocated (including padding etc). +The `zero` parameter is `true` if the allocated block is zero initialized. + +Optional: + + #define mi_track_align(p,alignedp,offset,size) + #define mi_track_resize(p,oldsize,newsize) + #define mi_track_init() + +The `mi_track_align` is called right after a `mi_track_malloc` for aligned pointers in a block. +The corresponding `mi_track_free` still uses the block start pointer and original size (corresponding to the `mi_track_malloc`). +The `mi_track_resize` is currently unused but could be called on reallocations within a block. +`mi_track_init` is called at program start. 
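An illustrative aside (not part of the patch, only a sketch): a backend that
merely logs allocations could plug in along these lines, mirroring the shape
of the valgrind/asan/ETW sections below; `my_log` is a hypothetical helper:

  #define mi_track_malloc_size(p,reqsize,size,zero)  my_log("alloc %p: %zu bytes\n", p, (size_t)(size))
  #define mi_track_free_size(p,_size)                my_log("free  %p: %zu bytes\n", p, (size_t)(_size))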
+ +The following macros are for tools like asan and valgrind to track whether memory is +defined, undefined, or not accessible at all: + + #define mi_track_mem_defined(p,size) + #define mi_track_mem_undefined(p,size) + #define mi_track_mem_noaccess(p,size) + +-------------------------------------------------------------------------------------------------------*/ + +#if MI_TRACK_VALGRIND +// valgrind tool + +#define MI_TRACK_ENABLED 1 +#define MI_TRACK_HEAP_DESTROY 1 // track free of individual blocks on heap_destroy +#define MI_TRACK_TOOL "valgrind" + +#include +#include + +#define mi_track_malloc_size(p,reqsize,size,zero) VALGRIND_MALLOCLIKE_BLOCK(p,size,MI_PADDING_SIZE /*red zone*/,zero) +#define mi_track_free_size(p,_size) VALGRIND_FREELIKE_BLOCK(p,MI_PADDING_SIZE /*red zone*/) +#define mi_track_resize(p,oldsize,newsize) VALGRIND_RESIZEINPLACE_BLOCK(p,oldsize,newsize,MI_PADDING_SIZE /*red zone*/) +#define mi_track_mem_defined(p,size) VALGRIND_MAKE_MEM_DEFINED(p,size) +#define mi_track_mem_undefined(p,size) VALGRIND_MAKE_MEM_UNDEFINED(p,size) +#define mi_track_mem_noaccess(p,size) VALGRIND_MAKE_MEM_NOACCESS(p,size) + +#elif MI_TRACK_ASAN +// address sanitizer + +#define MI_TRACK_ENABLED 1 +#define MI_TRACK_HEAP_DESTROY 0 +#define MI_TRACK_TOOL "asan" + +#include + +#define mi_track_malloc_size(p,reqsize,size,zero) ASAN_UNPOISON_MEMORY_REGION(p,size) +#define mi_track_free_size(p,size) ASAN_POISON_MEMORY_REGION(p,size) +#define mi_track_mem_defined(p,size) ASAN_UNPOISON_MEMORY_REGION(p,size) +#define mi_track_mem_undefined(p,size) ASAN_UNPOISON_MEMORY_REGION(p,size) +#define mi_track_mem_noaccess(p,size) ASAN_POISON_MEMORY_REGION(p,size) + +#elif MI_TRACK_ETW +// windows event tracing + +#define MI_TRACK_ENABLED 1 +#define MI_TRACK_HEAP_DESTROY 1 +#define MI_TRACK_TOOL "ETW" + +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif +#include +#error #include "../src/prim/windows/etw.h" + +#define mi_track_init() EventRegistermicrosoft_windows_mimalloc(); +#define mi_track_malloc_size(p,reqsize,size,zero) EventWriteETW_MI_ALLOC((UINT64)(p), size) +#define mi_track_free_size(p,size) EventWriteETW_MI_FREE((UINT64)(p), size) + +#else +// no tracking + +#define MI_TRACK_ENABLED 0 +#define MI_TRACK_HEAP_DESTROY 0 +#define MI_TRACK_TOOL "none" + +#define mi_track_malloc_size(p,reqsize,size,zero) +#define mi_track_free_size(p,_size) + +#endif + +// ------------------- +// Utility definitions + +#ifndef mi_track_resize +#define mi_track_resize(p,oldsize,newsize) mi_track_free_size(p,oldsize); mi_track_malloc(p,newsize,false) +#endif + +#ifndef mi_track_align +#define mi_track_align(p,alignedp,offset,size) mi_track_mem_noaccess(p,offset) +#endif + +#ifndef mi_track_init +#define mi_track_init() +#endif + +#ifndef mi_track_mem_defined +#define mi_track_mem_defined(p,size) +#endif + +#ifndef mi_track_mem_undefined +#define mi_track_mem_undefined(p,size) +#endif + +#ifndef mi_track_mem_noaccess +#define mi_track_mem_noaccess(p,size) +#endif + + +#if MI_PADDING +#define mi_track_malloc(p,reqsize,zero) \ + if ((p)!=NULL) { \ + mi_assert_internal(mi_usable_size(p)==(reqsize)); \ + mi_track_malloc_size(p,reqsize,reqsize,zero); \ + } +#else +#define mi_track_malloc(p,reqsize,zero) \ + if ((p)!=NULL) { \ + mi_assert_internal(mi_usable_size(p)>=(reqsize)); \ + mi_track_malloc_size(p,reqsize,mi_usable_size(p),zero); \ + } +#endif + +#endif diff --git a/contrib/libs/mimalloc/include/mimalloc-types.h b/contrib/libs/mimalloc/include/mimalloc/types.h similarity index 63% rename from 
contrib/libs/mimalloc/include/mimalloc-types.h rename to contrib/libs/mimalloc/include/mimalloc/types.h index caf161d63f13..ed326c694e94 100644 --- a/contrib/libs/mimalloc/include/mimalloc-types.h +++ b/contrib/libs/mimalloc/include/mimalloc/types.h @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2021, Microsoft Research, Daan Leijen +Copyright (c) 2018-2024, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -8,16 +8,29 @@ terms of the MIT license. A copy of the license can be found in the file #ifndef MIMALLOC_TYPES_H #define MIMALLOC_TYPES_H +// -------------------------------------------------------------------------- +// This file contains the main type definitions for mimalloc: +// mi_heap_t : all data for a thread-local heap, contains +// lists of all managed heap pages. +// mi_segment_t : a larger chunk of memory (32GiB) from where pages +// are allocated. +// mi_page_t : a mimalloc page (usually 64KiB or 512KiB) from +// where objects are allocated. +// Note: we write "OS page" for OS memory pages while +// using plain "page" for mimalloc pages (`mi_page_t`). +// -------------------------------------------------------------------------- + + #include // ptrdiff_t #include // uintptr_t, uint16_t, etc -#include "mimalloc-atomic.h" // _Atomic +#include "atomic.h" // _Atomic #ifdef _MSC_VER #pragma warning(disable:4214) // bitfield is not int -#endif +#endif // Minimal alignment necessary. On most platforms 16 bytes are needed -// due to SSE registers for example. This must be at least `MI_INTPTR_SIZE` +// due to SSE registers for example. This must be at least `sizeof(void*)` #ifndef MI_MAX_ALIGN_SIZE #define MI_MAX_ALIGN_SIZE 16 // sizeof(max_align_t) #endif @@ -29,6 +42,11 @@ terms of the MIT license. A copy of the license can be found in the file // Define NDEBUG in the release version to disable assertions. // #define NDEBUG +// Define MI_TRACK_ to enable tracking support +// #define MI_TRACK_VALGRIND 1 +// #define MI_TRACK_ASAN 1 +// #define MI_TRACK_ETW 1 + // Define MI_STAT as 1 to maintain statistics; set it to 2 to have detailed statistics (but costs some performance). // #define MI_STAT 1 @@ -55,18 +73,32 @@ terms of the MIT license. A copy of the license can be found in the file #endif // Reserve extra padding at the end of each block to be more resilient against heap block overflows. -// The padding can detect byte-precise buffer overflow on free. -#if !defined(MI_PADDING) && (MI_DEBUG>=1) +// The padding can detect buffer overflow on free. +#if !defined(MI_PADDING) && (MI_SECURE>=3 || MI_DEBUG>=1 || (MI_TRACK_VALGRIND || MI_TRACK_ASAN || MI_TRACK_ETW)) #define MI_PADDING 1 #endif +// Check padding bytes; allows byte-precise buffer overflow detection +#if !defined(MI_PADDING_CHECK) && MI_PADDING && (MI_SECURE>=3 || MI_DEBUG>=1) +#define MI_PADDING_CHECK 1 +#endif + // Encoded free lists allow detection of corrupted free lists // and can detect buffer overflows, modify after free, and double `free`s. -#if (MI_SECURE>=3 || MI_DEBUG>=1 || MI_PADDING > 0) +#if (MI_SECURE>=3 || MI_DEBUG>=1) #define MI_ENCODE_FREELIST 1 #endif + +// We used to abandon huge pages in order to eagerly deallocate it if freed from another thread. 
+// Unfortunately, that makes it not possible to visit them during a heap walk or include them in a +// `mi_heap_destroy`. We therefore instead reset/decommit the huge blocks nowadays if freed from +// another thread so the memory becomes "virtually" available (and eventually gets properly freed by +// the owning thread). +// #define MI_HUGE_PAGE_ABANDON 1 + + // ------------------------------------------------------ // Platform specific values // ------------------------------------------------------ @@ -83,20 +115,43 @@ terms of the MIT license. A copy of the license can be found in the file // or otherwise one might define an intptr_t type that is larger than a pointer... // ------------------------------------------------------ -#if INTPTR_MAX == 9223372036854775807LL +#if INTPTR_MAX > INT64_MAX +# define MI_INTPTR_SHIFT (4) // assume 128-bit (as on arm CHERI for example) +#elif INTPTR_MAX == INT64_MAX # define MI_INTPTR_SHIFT (3) -#elif INTPTR_MAX == 2147483647LL +#elif INTPTR_MAX == INT32_MAX # define MI_INTPTR_SHIFT (2) #else -#error platform must be 32 or 64 bits +#error platform pointers must be 32, 64, or 128 bits +#endif + +#if SIZE_MAX == UINT64_MAX +# define MI_SIZE_SHIFT (3) +typedef int64_t mi_ssize_t; +#elif SIZE_MAX == UINT32_MAX +# define MI_SIZE_SHIFT (2) +typedef int32_t mi_ssize_t; +#else +#error platform objects must be 32 or 64 bits +#endif + +#if (SIZE_MAX/2) > LONG_MAX +# define MI_ZU(x) x##ULL +# define MI_ZI(x) x##LL +#else +# define MI_ZU(x) x##UL +# define MI_ZI(x) x##L #endif #define MI_INTPTR_SIZE (1<= 655360) -#error "define more bins" +#error "mimalloc internal: define more bins" #endif -// Used as a special value to encode block sizes in 32 bits. -#define MI_HUGE_BLOCK_SIZE ((uint32_t)MI_HUGE_OBJ_SIZE_MAX) +// Maximum block size for which blocks are guaranteed to be block size aligned. (see `segment.c:_mi_segment_page_start`) +#define MI_MAX_ALIGN_GUARANTEE (MI_MEDIUM_OBJ_SIZE_MAX) + +// Alignments over MI_BLOCK_ALIGNMENT_MAX are allocated in dedicated huge page segments +#define MI_BLOCK_ALIGNMENT_MAX (MI_SEGMENT_SIZE >> 1) + +// We never allocate more than PTRDIFF_MAX (see also ) +#define MI_MAX_ALLOC_SIZE PTRDIFF_MAX + +// ------------------------------------------------------ +// Mimalloc pages contain allocated blocks +// ------------------------------------------------------ // The free lists use encoded next fields // (Only actually encodes when MI_ENCODED_FREELIST is defined.) -typedef uintptr_t mi_encoded_t; +typedef uintptr_t mi_encoded_t; + +// thread id's +typedef size_t mi_threadid_t; // free lists contain blocks typedef struct mi_block_s { @@ -155,7 +231,7 @@ typedef enum mi_delayed_e { MI_USE_DELAYED_FREE = 0, // push on the owning heap thread delayed list MI_DELAYED_FREEING = 1, // temporary: another thread is accessing the owning heap MI_NO_DELAYED_FREE = 2, // optimize: push on page local thread free queue if another block is already in the heap thread delayed free list - MI_NEVER_DELAYED_FREE = 3 // sticky, only resets on page reclaim + MI_NEVER_DELAYED_FREE = 3 // sticky: used for abondoned pages without a owning heap; this only resets on page reclaim } mi_delayed_t; @@ -194,94 +270,150 @@ typedef uintptr_t mi_thread_free_t; // implement a monotonic heartbeat. The `thread_free` list is needed for // avoiding atomic operations in the common case. 
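Since `mi_delayed_t` has only four values, it fits in the two least-significant bits of a (pointer-aligned) block pointer, which is how a single `mi_thread_free_t` word can carry both at once, as the comments that follow note. A hedged sketch using the types defined just above, with hypothetical helper names (mimalloc's own accessors live in its internal headers):

```
// illustrative sketch only -- helper names are hypothetical
static inline mi_thread_free_t tf_make(mi_block_t* block, mi_delayed_t delayed) {
  return (mi_thread_free_t)((uintptr_t)block | (uintptr_t)delayed);   // low 2 bits hold the state
}
static inline mi_block_t* tf_block(mi_thread_free_t tf) {
  return (mi_block_t*)(tf & ~(uintptr_t)0x03);                        // mask off the flag bits
}
static inline mi_delayed_t tf_delayed(mi_thread_free_t tf) {
  return (mi_delayed_t)(tf & 0x03);                                   // the flag bits themselves
}
```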
// -// // `used - |thread_free|` == actual blocks that are in use (alive) // `used - |thread_free| + |free| + |local_free| == capacity` // // We don't count `freed` (as |free|) but use `used` to reduce // the number of memory accesses in the `mi_page_all_free` function(s). // -// Notes: -// - Access is optimized for `mi_free` and `mi_page_alloc` (in `alloc.c`) +// Notes: +// - Access is optimized for `free.c:mi_free` and `alloc.c:mi_page_alloc` // - Using `uint16_t` does not seem to slow things down -// - The size is 8 words on 64-bit which helps the page index calculations -// (and 10 words on 32-bit, and encoded free lists add 2 words. Sizes 10 -// and 12 are still good for address calculation) -// - To limit the structure size, the `xblock_size` is 32-bits only; for -// blocks > MI_HUGE_BLOCK_SIZE the size is determined from the segment page size -// - `thread_free` uses the bottom bits as a delayed-free flags to optimize +// - The size is 10 words on 64-bit which helps the page index calculations +// (and 12 words on 32-bit, and encoded free lists add 2 words) +// - `xthread_free` uses the bottom bits as a delayed-free flags to optimize // concurrent frees where only the first concurrent free adds to the owning -// heap `thread_delayed_free` list (see `alloc.c:mi_free_block_mt`). +// heap `thread_delayed_free` list (see `free.c:mi_free_block_mt`). // The invariant is that no-delayed-free is only set if there is -// at least one block that will be added, or as already been added, to +// at least one block that will be added, or as already been added, to // the owning heap `thread_delayed_free` list. This guarantees that pages // will be freed correctly even if only other threads free blocks. typedef struct mi_page_s { // "owned" by the segment uint8_t segment_idx; // index in the segment `pages` array, `page == &segment->pages[page->segment_idx]` uint8_t segment_in_use:1; // `true` if the segment allocated this page - uint8_t is_reset:1; // `true` if the page memory was reset uint8_t is_committed:1; // `true` if the page virtual memory is committed - uint8_t is_zero_init:1; // `true` if the page was zero initialized + uint8_t is_zero_init:1; // `true` if the page was initially zero initialized + uint8_t is_huge:1; // `true` if the page is in a huge segment // layout like this to optimize access in `mi_malloc` and `mi_free` uint16_t capacity; // number of blocks committed, must be the first field, see `segment.c:page_clear` uint16_t reserved; // number of blocks reserved in memory mi_page_flags_t flags; // `in_full` and `has_aligned` flags (8 bits) - uint8_t is_zero:1; // `true` if the blocks in the free list are zero initialized + uint8_t free_is_zero:1; // `true` if the blocks in the free list are zero initialized uint8_t retire_expire:7; // expiration count for retired blocks mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) - #ifdef MI_ENCODE_FREELIST - uintptr_t keys[2]; // two random keys to encode the free lists (see `_mi_block_next`) + mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) + uint16_t used; // number of blocks in use (including blocks in `thread_free`) + uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`) + uint8_t heap_tag; // tag of the owning heap, used for separated heaps by object type + // padding + size_t block_size; // size available in each block (always `>0`) + uint8_t* page_start; // 
start of the page area containing the blocks + + #if (MI_ENCODE_FREELIST || MI_PADDING) + uintptr_t keys[2]; // two random keys to encode the free lists (see `_mi_block_next`) or padding canary #endif - uint32_t used; // number of blocks in use (including blocks in `local_free` and `thread_free`) - uint32_t xblock_size; // size available in each block (always `>0`) - mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads _Atomic(uintptr_t) xheap; - - struct mi_page_s* next; // next page owned by this thread with the same `block_size` - struct mi_page_s* prev; // previous page owned by this thread with the same `block_size` + + struct mi_page_s* next; // next page owned by the heap with the same `block_size` + struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` + + #if MI_INTPTR_SIZE==4 // pad to 12 words on 32-bit + void* padding[1]; + #endif } mi_page_t; +// ------------------------------------------------------ +// Mimalloc segments contain mimalloc pages +// ------------------------------------------------------ + typedef enum mi_page_kind_e { - MI_PAGE_SMALL, // small blocks go into 64kb pages inside a segment - MI_PAGE_MEDIUM, // medium blocks go into 512kb pages inside a segment + MI_PAGE_SMALL, // small blocks go into 64KiB pages inside a segment + MI_PAGE_MEDIUM, // medium blocks go into 512KiB pages inside a segment MI_PAGE_LARGE, // larger blocks go into a single page spanning a whole segment - MI_PAGE_HUGE // huge blocks (>512kb) are put into a single page in a segment of the exact size (but still 2mb aligned) + MI_PAGE_HUGE // a huge page is a single page in a segment of variable size (but still 2MiB aligned) + // used for blocks `> MI_LARGE_OBJ_SIZE_MAX` or an aligment `> MI_BLOCK_ALIGNMENT_MAX`. } mi_page_kind_t; -// Segments are large allocated memory blocks (2mb on 64 bit) from -// the OS. Inside segments we allocated fixed size _pages_ that -// contain blocks. + +// --------------------------------------------------------------- +// a memory id tracks the provenance of arena/OS allocated memory +// --------------------------------------------------------------- + +// Memory can reside in arena's, direct OS allocated, or statically allocated. The memid keeps track of this. +typedef enum mi_memkind_e { + MI_MEM_NONE, // not allocated + MI_MEM_EXTERNAL, // not owned by mimalloc but provided externally (via `mi_manage_os_memory` for example) + MI_MEM_STATIC, // allocated in a static area and should not be freed (for arena meta data for example) + MI_MEM_OS, // allocated from the OS + MI_MEM_OS_HUGE, // allocated as huge OS pages (usually 1GiB, pinned to physical memory) + MI_MEM_OS_REMAP, // allocated in a remapable area (i.e. 
using `mremap`) + MI_MEM_ARENA // allocated from an arena (the usual case) +} mi_memkind_t; + +static inline bool mi_memkind_is_os(mi_memkind_t memkind) { + return (memkind >= MI_MEM_OS && memkind <= MI_MEM_OS_REMAP); +} + +typedef struct mi_memid_os_info { + void* base; // actual base address of the block (used for offset aligned allocations) + size_t alignment; // alignment at allocation +} mi_memid_os_info_t; + +typedef struct mi_memid_arena_info { + size_t block_index; // index in the arena + mi_arena_id_t id; // arena id (>= 1) + bool is_exclusive; // this arena can only be used for specific arena allocations +} mi_memid_arena_info_t; + +typedef struct mi_memid_s { + union { + mi_memid_os_info_t os; // only used for MI_MEM_OS + mi_memid_arena_info_t arena; // only used for MI_MEM_ARENA + } mem; + bool is_pinned; // `true` if we cannot decommit/reset/protect in this memory (e.g. when allocated using large (2Mib) or huge (1GiB) OS pages) + bool initially_committed;// `true` if the memory was originally allocated as committed + bool initially_zero; // `true` if the memory was originally zero initialized + mi_memkind_t memkind; +} mi_memid_t; + + +// --------------------------------------------------------------- +// Segments contain mimalloc pages +// --------------------------------------------------------------- + +// Segments are large allocated memory blocks (2MiB on 64 bit) from the OS. +// Inside segments we allocated fixed size _pages_ that contain blocks. typedef struct mi_segment_s { - // memory fields - size_t memid; // id for the os-level memory manager - bool mem_is_pinned; // `true` if we cannot decommit/reset/protect in this memory (i.e. when allocated using large OS pages) - bool mem_is_committed; // `true` if the whole segment is eagerly committed + // constant fields + mi_memid_t memid; // memory id to track provenance + bool allow_decommit; + bool allow_purge; + size_t segment_size; // for huge pages this may be different from `MI_SEGMENT_SIZE` // segment fields - _Atomic(struct mi_segment_s*) abandoned_next; struct mi_segment_s* next; // must be the first segment field after abandoned_next -- see `segment.c:segment_init` struct mi_segment_s* prev; + bool was_reclaimed; // true if it was reclaimed (used to limit on-free reclamation) size_t abandoned; // abandoned pages (i.e. the original owning thread stopped) (`abandoned <= used`) size_t abandoned_visits; // count how often this segment is visited in the abandoned list (to force reclaim if it is too long) size_t used; // count of pages in use (`used <= capacity`) size_t capacity; // count of available pages (`#free + used`) - size_t segment_size; // for huge pages this may be different from `MI_SEGMENT_SIZE` size_t segment_info_size;// space we are using from the first page for segment meta-data and possible guard pages. uintptr_t cookie; // verify addresses in secure mode: `_mi_ptr_cookie(segment) == segment->cookie` // layout like this to optimize access in `mi_free` + _Atomic(mi_threadid_t) thread_id; // unique id of the thread owning this segment size_t page_shift; // `1 << page_shift` == the page sizes == `page->block_size * page->reserved` (unless the first page, then `-segment_info_size`). 
- _Atomic(uintptr_t) thread_id; // unique id of the thread owning this segment - mi_page_kind_t page_kind; // kind of pages: small, large, or huge + mi_page_kind_t page_kind; // kind of pages: small, medium, large, or huge mi_page_t pages[1]; // up to `MI_SMALL_PAGES_PER_SEGMENT` pages } mi_segment_t; @@ -316,10 +448,11 @@ typedef struct mi_random_cxt_s { uint32_t input[16]; uint32_t output[16]; int output_available; + bool weak; } mi_random_ctx_t; -// In debug mode there is a padding stucture at the end of the blocks to check for buffer overflows +// In debug mode there is a padding structure at the end of the blocks to check for buffer overflows #if (MI_PADDING) typedef struct mi_padding_s { uint32_t canary; // encoded block value to check validity of the padding (in case of overflow) @@ -338,10 +471,9 @@ typedef struct mi_padding_s { // A heap owns a set of pages. struct mi_heap_s { mi_tld_t* tld; - mi_page_t* pages_free_direct[MI_PAGES_DIRECT]; // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size. - mi_page_queue_t pages[MI_BIN_FULL + 1]; // queue of pages for each size class (or "bin") _Atomic(mi_block_t*) thread_delayed_free; - uintptr_t thread_id; // thread this heap belongs too + mi_threadid_t thread_id; // thread this heap belongs too + mi_arena_id_t arena_id; // arena id if the heap belongs to a specific arena (or 0) uintptr_t cookie; // random cookie to verify pointers (see `_mi_ptr_cookie`) uintptr_t keys[2]; // two random keys used to encode the `thread_delayed_free` list mi_random_ctx_t random; // random number context used for secure allocation @@ -350,6 +482,9 @@ struct mi_heap_s { size_t page_retired_max; // largest retired index into the `pages` array. mi_heap_t* next; // list of heaps per thread bool no_reclaim; // `true` if this heap should not reclaim abandoned pages + uint8_t tag; // custom tag, can be used for separating heaps based on the object types + mi_page_t* pages_free_direct[MI_PAGES_DIRECT]; // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size. 
+ mi_page_queue_t pages[MI_BIN_FULL + 1]; // queue of pages for each size class (or "bin") }; @@ -358,9 +493,15 @@ struct mi_heap_s { // Debug // ------------------------------------------------------ +#if !defined(MI_DEBUG_UNINIT) #define MI_DEBUG_UNINIT (0xD0) +#endif +#if !defined(MI_DEBUG_FREED) #define MI_DEBUG_FREED (0xDF) +#endif +#if !defined(MI_DEBUG_PADDING) #define MI_DEBUG_PADDING (0xDE) +#endif #if (MI_DEBUG) // use our own assertion to print without memory allocation @@ -412,6 +553,7 @@ typedef struct mi_stats_s { mi_stat_count_t reserved; mi_stat_count_t committed; mi_stat_count_t reset; + mi_stat_count_t purged; mi_stat_count_t page_committed; mi_stat_count_t segments_abandoned; mi_stat_count_t pages_abandoned; @@ -424,11 +566,15 @@ typedef struct mi_stats_s { mi_stat_counter_t pages_extended; mi_stat_counter_t mmap_calls; mi_stat_counter_t commit_calls; + mi_stat_counter_t reset_calls; + mi_stat_counter_t purge_calls; mi_stat_counter_t page_no_retire; mi_stat_counter_t searches; mi_stat_counter_t normal_count; mi_stat_counter_t huge_count; - mi_stat_counter_t giant_count; + mi_stat_counter_t arena_count; + mi_stat_counter_t arena_crossover_count; + mi_stat_counter_t arena_rollback_count; #if MI_STAT>1 mi_stat_count_t normal_bins[MI_BIN_HUGE+1]; #endif @@ -453,6 +599,7 @@ void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); #define mi_heap_stat_increase(heap,stat,amount) mi_stat_increase( (heap)->tld->stats.stat, amount) #define mi_heap_stat_decrease(heap,stat,amount) mi_stat_decrease( (heap)->tld->stats.stat, amount) + // ------------------------------------------------------ // Thread Local data // ------------------------------------------------------ @@ -475,14 +622,12 @@ typedef struct mi_os_tld_s { typedef struct mi_segments_tld_s { mi_segment_queue_t small_free; // queue of segments with free small pages mi_segment_queue_t medium_free; // queue of segments with free medium pages - mi_page_queue_t pages_reset; // queue of freed pages that can be reset + mi_page_queue_t pages_purge; // queue of freed pages that are delay purged size_t count; // current number of segments; size_t peak_count; // peak number of segments size_t current_size; // current size of all segments size_t peak_size; // peak size of all segments - size_t cache_count; // number of segments in the cache - size_t cache_size; // total size of all segments in the cache - mi_segment_t* cache; // (small) cache of segments + size_t reclaim_count;// number of reclaimed (abandoned) segments mi_stats_t* stats; // points to tld stats mi_os_tld_t* os; // points to os stats } mi_segments_tld_t; diff --git a/contrib/libs/mimalloc/readme.md b/contrib/libs/mimalloc/readme.md index cdb1b82aad18..a0296b43c35a 100644 --- a/contrib/libs/mimalloc/readme.md +++ b/contrib/libs/mimalloc/readme.md @@ -9,24 +9,28 @@ mimalloc (pronounced "me-malloc") is a general purpose allocator with excellent [performance](#performance) characteristics. -Initially developed by Daan Leijen for the run-time systems of the +Initially developed by Daan Leijen for the runtime systems of the [Koka](https://koka-lang.github.io) and [Lean](https://github.com/leanprover/lean) languages. -Latest release tag: `v2.0.2` (beta, 2021-06-17). -Latest stable tag: `v1.7.2` (2021-06-17). +Latest release tag: `v2.1.7` (2024-05-21). +Latest v1 tag: `v1.8.7` (2024-05-21). 
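Besides the drop-in override described just below, mimalloc can also be called directly through its `mi_` prefixed API. A minimal sketch, assuming the headers and library are installed and linked (e.g. something like `cc demo.c -lmimalloc`, exact flags depending on the installation):

```
#include <mimalloc.h>
#include <stdio.h>

int main(void) {
  printf("mimalloc version: %d\n", mi_version());
  void* p = mi_malloc(64);    // same semantics as malloc(64)
  p = mi_realloc(p, 256);     // grow the block
  mi_free(p);                 // blocks from mi_malloc must be freed with mi_free
  return 0;
}
```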
mimalloc is a drop-in replacement for `malloc` and can be used in other programs without code changes, for example, on dynamically linked ELF-based systems (Linux, BSD, etc.) you can use it as: ``` -> LD_PRELOAD=/usr/bin/libmimalloc.so myprogram +> LD_PRELOAD=/usr/lib/libmimalloc.so myprogram ``` -It also has an easy way to override the default allocator in [Windows](#override_on_windows). Notable aspects of the design include: +It also includes a robust way to override the default allocator in [Windows](#override_on_windows). Notable aspects of the design include: - __small and consistent__: the library is about 8k LOC using simple and consistent data structures. This makes it very suitable to integrate and adapt in other projects. For runtime systems it provides hooks for a monotonic _heartbeat_ and deferred freeing (for bounded worst-case times with reference counting). + Partly due to its simplicity, mimalloc has been ported to many systems (Windows, macOS, + Linux, WASM, various BSD's, Haiku, MUSL, etc) and has excellent support for dynamic overriding. + At the same time, it is an industrial strength allocator that runs (very) large scale + distributed services on thousands of machines with excellent worst case latencies. - __free list sharding__: instead of one big free list (per size class) we have many smaller lists per "mimalloc page" which reduces fragmentation and increases locality -- @@ -36,13 +40,13 @@ It also has an easy way to override the default allocator in [Windows](#override per mimalloc page, but for each page we have multiple free lists. In particular, there is one list for thread-local `free` operations, and another one for concurrent `free` operations. Free-ing from another thread can now be a single CAS without needing - sophisticated coordination between threads. Since there will be + sophisticated coordination between threads. Since there will be thousands of separate free lists, contention is naturally distributed over the heap, and the chance of contending on a single location will be low -- this is quite similar to randomized algorithms like skip lists where adding a random oracle removes the need for a more complex algorithm. -- __eager page reset__: when a "page" becomes empty (with increased chance - due to free list sharding) the memory is marked to the OS as unused ("reset" or "purged") +- __eager page purging__: when a "page" becomes empty (with increased chance + due to free list sharding) the memory is marked to the OS as unused (reset or decommitted) reducing (real) memory pressure and fragmentation, especially in long running programs. - __secure__: _mimalloc_ can be built in secure mode, adding guard pages, @@ -50,71 +54,79 @@ It also has an easy way to override the default allocator in [Windows](#override heap vulnerabilities. The performance penalty is usually around 10% on average over our benchmarks. - __first-class heaps__: efficiently create and use multiple heaps to allocate across different regions. - A heap can be destroyed at once instead of deallocating each object separately. + A heap can be destroyed at once instead of deallocating each object separately. - __bounded__: it does not suffer from _blowup_ \[1\], has bounded worst-case allocation - times (_wcat_), bounded space overhead (~0.2% meta-data, with at most 12.5% waste in allocation sizes), - and has no internal points of contention using only atomic operations. 
+ times (_wcat_) (upto OS primitives), bounded space overhead (~0.2% meta-data, with low + internal fragmentation), and has no internal points of contention using only atomic operations. - __fast__: In our benchmarks (see [below](#performance)), _mimalloc_ outperforms other leading allocators (_jemalloc_, _tcmalloc_, _Hoard_, etc), - and often uses less memory. A nice property - is that it does consistently well over a wide range of benchmarks. There is also good huge OS page - support for larger server programs. + and often uses less memory. A nice property is that it does consistently well over a wide range + of benchmarks. There is also good huge OS page support for larger server programs. The [documentation](https://microsoft.github.io/mimalloc) gives a full overview of the API. -You can read more on the design of _mimalloc_ in the [technical report](https://www.microsoft.com/en-us/research/publication/mimalloc-free-list-sharding-in-action) which also has detailed benchmark results. +You can read more on the design of _mimalloc_ in the [technical report](https://www.microsoft.com/en-us/research/publication/mimalloc-free-list-sharding-in-action) which also has detailed benchmark results. -Enjoy! +Enjoy! ### Branches -* `master`: latest stable release. -* `dev`: development branch for mimalloc v1. -* `dev-slice`: development branch for mimalloc v2 with a new algorithm for managing internal mimalloc pages. +* `master`: latest stable release (based on `dev-slice`). +* `dev`: development branch for mimalloc v1. Use this branch for submitting PR's. +* `dev-slice`: development branch for mimalloc v2. This branch is downstream of `dev` (and is essentially equal to `dev` except for +`src/segment.c`) ### Releases -Note: the `v2.x` beta has a new algorithm for managing internal mimalloc pages that tends to use reduce memory usage +Note: the `v2.x` version has a different algorithm for managing internal mimalloc pages (as slices) that tends to use reduce +memory usage and fragmentation compared to mimalloc `v1.x` (especially for large workloads). Should otherwise have similar performance (see [below](#performance)); please report if you observe any significant performance regression. -* 2021-06-17, `v1.7.2`, `v2.0.2` (beta): support M1, better installation layout on Linux, fix - thread_id on Android, prefer 2-6TiB area for aligned allocation to work better on pre-windows 8, various small fixes. - -* 2021-04-06, `v1.7.1`, `v2.0.1` (beta): fix bug in arena allocation for huge pages, improved aslr on large allocations, initial M1 support (still experimental). +* 2024-05-21, `v1.8.7`, `v2.1.7`: Fix build issues on less common platforms. Started upstreaming patches + from the CPython [integration](https://github.com/python/cpython/issues/113141#issuecomment-2119255217). Upstream `vcpkg` patches. +* 2024-05-13, `v1.8.6`, `v2.1.6`: Fix build errors on various (older) platforms. Refactored aligned allocation. +* 2024-04-22, `v1.8.4`, `v2.1.4`: Fixes various bugs and build issues. Add `MI_LIBC_MUSL` cmake flag for musl builds. + Free-ing code is refactored into a separate module (`free.c`). Mimalloc page info is simplified with the block size + directly available (and new `block_size_shift` to improve aligned block free-ing). + New approach to collection of abandoned segments: When + a thread terminates the segments it owns are abandoned (containing still live objects) and these can be + reclaimed by other threads. 
We no longer use a list of abandoned segments but this is now done using bitmaps in arena's + which is more concurrent (and more aggressive). Abandoned memory can now also be reclaimed if a thread frees an object in + an abandoned page (which can be disabled using `mi_option_abandoned_reclaim_on_free`). The option `mi_option_max_segment_reclaim` + gives a maximum percentage of abandoned segments that can be reclaimed per try (=10%). + +* 2023-04-24, `v1.8.2`, `v2.1.2`: Fixes build issues on freeBSD, musl, and C17 (UE 5.1.1). Reduce code size/complexity + by removing regions and segment-cache's and only use arenas with improved memory purging -- this may improve memory + usage as well for larger services. Renamed options for consistency. Improved Valgrind and ASAN checking. -* 2021-01-31, `v2.0.0`: beta release 2.0: new slice algorithm for managing internal mimalloc pages. - -* 2021-01-31, `v1.7.0`: stable release 1.7: support explicit user provided memory regions, more precise statistics, - improve macOS overriding, initial support for Apple M1, improved DragonFly support, faster memcpy on Windows, various small fixes. +* 2023-04-03, `v1.8.1`, `v2.1.1`: Fixes build issues on some platforms. -### Older Releases +* 2023-03-29, `v1.8.0`, `v2.1.0`: Improved support dynamic overriding on Windows 11. Improved tracing precision + with [asan](#asan) and [Valgrind](#valgrind), and added Windows event tracing [ETW](#ETW) (contributed by Xinglong He). Created an OS + abstraction layer to make it easier to port and separate platform dependent code (in `src/prim`). Fixed C++ STL compilation on older Microsoft C++ compilers, and various small bug fixes. -* 2020-09-24, `v1.6.7`: stable release 1.6: using standard C atomics, passing tsan testing, improved - handling of failing to commit on Windows, add [`mi_process_info`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc.h#L156) api call. -* 2020-08-06, `v1.6.4`: stable release 1.6: improved error recovery in low-memory situations, - support for IllumOS and Haiku, NUMA support for Vista/XP, improved NUMA detection for AMD Ryzen, ubsan support. -* 2020-05-05, `v1.6.3`: stable release 1.6: improved behavior in out-of-memory situations, improved malloc zones on macOS, - build PIC static libraries by default, add option to abort on out-of-memory, line buffered statistics. -* 2020-04-20, `v1.6.2`: stable release 1.6: fix compilation on Android, MingW, Raspberry, and Conda, - stability fix for Windows 7, fix multiple mimalloc instances in one executable, fix `strnlen` overload, - fix aligned debug padding. -* 2020-02-17, `v1.6.1`: stable release 1.6: minor updates (build with clang-cl, fix alignment issue for small objects). -* 2020-02-09, `v1.6.0`: stable release 1.6: fixed potential memory leak, improved overriding - and thread local support on FreeBSD, NetBSD, DragonFly, and macOSX. New byte-precise - heap block overflow detection in debug mode (besides the double-free detection and free-list - corruption detection). Add `nodiscard` attribute to most allocation functions. - Enable `MIMALLOC_PAGE_RESET` by default. New reclamation strategy for abandoned heap pages - for better memory footprint. -* 2020-02-09, `v1.5.0`: stable release 1.5: improved free performance, small bug fixes. -* 2020-01-22, `v1.4.0`: stable release 1.4: improved performance for delayed OS page reset, -more eager concurrent free, addition of STL allocator, fixed potential memory leak. 
-* 2020-01-15, `v1.3.0`: stable release 1.3: bug fixes, improved randomness and [stronger -free list encoding](https://github.com/microsoft/mimalloc/blob/783e3377f79ee82af43a0793910a9f2d01ac7863/include/mimalloc-internal.h#L396) in secure mode. -* 2019-12-22, `v1.2.2`: stable release 1.2: minor updates. -* 2019-11-22, `v1.2.0`: stable release 1.2: bug fixes, improved secure mode (free list corruption checks, double free mitigation). Improved dynamic overriding on Windows. -* 2019-10-07, `v1.1.0`: stable release 1.1. -* 2019-09-01, `v1.0.8`: pre-release 8: more robust windows dynamic overriding, initial huge page support. -* 2019-08-10, `v1.0.6`: pre-release 6: various performance improvements. +* 2022-12-23, `v1.7.9`, `v2.0.9`: Supports building with [asan](#asan) and improved [Valgrind](#valgrind) support. + Support arbitrary large alignments (in particular for `std::pmr` pools). + Added C++ STL allocators attached to a specific heap (thanks @vmarkovtsev). + Heap walks now visit all object (including huge objects). Support Windows nano server containers (by Johannes Schindelin,@dscho). + Various small bug fixes. + +* 2022-11-03, `v1.7.7`, `v2.0.7`: Initial support for [Valgrind](#valgrind) for leak testing and heap block overflow + detection. Initial + support for attaching heaps to a speficic memory area (only in v2). Fix `realloc` behavior for zero size blocks, remove restriction to integral multiple of the alignment in `alloc_align`, improved aligned allocation performance, reduced contention with many threads on few processors (thank you @dposluns!), vs2022 support, support `pkg-config`, . + +* 2022-04-14, `v1.7.6`, `v2.0.6`: fix fallback path for aligned OS allocation on Windows, improve Windows aligned allocation + even when compiling with older SDK's, fix dynamic overriding on macOS Monterey, fix MSVC C++ dynamic overriding, fix + warnings under Clang 14, improve performance if many OS threads are created and destroyed, fix statistics for large object + allocations, using MIMALLOC_VERBOSE=1 has no maximum on the number of error messages, various small fixes. + +* 2022-02-14, `v1.7.5`, `v2.0.5` (alpha): fix malloc override on + Windows 11, fix compilation with musl, potentially reduced + committed memory, add `bin/minject` for Windows, + improved wasm support, faster aligned allocation, + various small fixes. + +* [Older release notes](#older-release-notes) Special thanks to: @@ -124,9 +136,11 @@ Special thanks to: memory model bugs using the [genMC] model checker. * Weipeng Liu (@pongba), Zhuowei Li, Junhua Wang, and Jakub Szymanski, for their early support of mimalloc and deployment at large scale services, leading to many improvements in the mimalloc algorithms for large workloads. -* Jason Gibson (@jasongibson) for exhaustive testing on large scale workloads and server environments, and finding complex bugs +* Jason Gibson (@jasongibson) for exhaustive testing on large scale workloads and server environments, and finding complex bugs in (early versions of) `mimalloc`. -* Manuel Pöter (@mpoeter) and Sam Gross (@colesbury) for finding an ABA concurrency issue in abandoned segment reclamation. +* Manuel Pöter (@mpoeter) and Sam Gross(@colesbury) for finding an ABA concurrency issue in abandoned segment reclamation. Sam also created the [no GIL](https://github.com/colesbury/nogil) Python fork which + uses mimalloc internally. 
+ [genMC]: https://plv.mpi-sws.org/genmc/ @@ -134,15 +148,18 @@ Special thanks to: mimalloc is used in various large scale low-latency services and programs, for example: - - - + + + + + + # Building ## Windows -Open `ide/vs2019/mimalloc.sln` in Visual Studio 2019 and build (or `ide/vs2017/mimalloc.sln`). +Open `ide/vs2022/mimalloc.sln` in Visual Studio 2022 and build. The `mimalloc` project builds a static library (in `out/msvc-x64`), while the `mimalloc-override` project builds a DLL for overriding malloc in the entire program. @@ -191,6 +208,11 @@ Notes: 2. Install CCMake: `sudo apt-get install cmake-curses-gui` +## Single source + +You can also directly build the single `src/static.c` file as part of your project without +needing `cmake` at all. Make sure to also add the mimalloc `include` directory to the include path. + # Using the library @@ -217,7 +239,7 @@ target_link_libraries(myapp PUBLIC mimalloc-static) to link with the static library. See `test\CMakeLists.txt` for an example. For best performance in C++ programs, it is also recommended to override the -global `new` and `delete` operators. For convience, mimalloc provides +global `new` and `delete` operators. For convenience, mimalloc provides [`mimalloc-new-delete.h`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc-new-delete.h) which does this for you -- just include it in a single(!) source file in your project. In C++, mimalloc also provides the `mi_stl_allocator` struct which implements the `std::allocator` interface. @@ -265,47 +287,63 @@ completely and redirect all calls to the _mimalloc_ library instead . ## Environment Options -You can set further options either programmatically (using [`mi_option_set`](https://microsoft.github.io/mimalloc/group__options.html)), -or via environment variables: +You can set further options either programmatically (using [`mi_option_set`](https://microsoft.github.io/mimalloc/group__options.html)), or via environment variables: - `MIMALLOC_SHOW_STATS=1`: show statistics when the program terminates. - `MIMALLOC_VERBOSE=1`: show verbose messages. - `MIMALLOC_SHOW_ERRORS=1`: show error and warning messages. -- `MIMALLOC_PAGE_RESET=0`: by default, mimalloc will reset (or purge) OS pages that are not in use, to signal to the OS - that the underlying physical memory can be reused. This can reduce memory fragmentation in long running (server) - programs. By setting it to `0` this will no longer be done which can improve performance for batch-like programs. - As an alternative, the `MIMALLOC_RESET_DELAY=` can be set higher (100ms by default) to make the page - reset occur less frequently instead of turning it off completely. + +Advanced options: + +- `MIMALLOC_ARENA_EAGER_COMMIT=2`: turns on eager commit for the large arenas (usually 1GiB) from which mimalloc + allocates segments and pages. Set this to 2 (default) to + only enable this on overcommit systems (e.g. Linux). Set this to 1 to enable explicitly on other systems + as well (like Windows or macOS) which may improve performance (as the whole arena is committed at once). + Note that eager commit only increases the commit but not the actual the peak resident set + (rss) so it is generally ok to enable this. +- `MIMALLOC_PURGE_DELAY=N`: the delay in `N` milli-seconds (by default `10`) after which mimalloc will purge + OS pages that are not in use. This signals to the OS that the underlying physical memory can be reused which + can reduce memory fragmentation especially in long running (server) programs. 
Setting `N` to `0` purges immediately when + a page becomes unused which can improve memory usage but also decreases performance. Setting `N` to a higher + value like `100` can improve performance (sometimes by a lot) at the cost of potentially using more memory at times. + Setting it to `-1` disables purging completely. +- `MIMALLOC_PURGE_DECOMMITS=1`: By default "purging" memory means unused memory is decommitted (`MEM_DECOMMIT` on Windows, + `MADV_DONTNEED` (which decresease rss immediately) on `mmap` systems). Set this to 0 to instead "reset" unused + memory on a purge (`MEM_RESET` on Windows, generally `MADV_FREE` (which does not decrease rss immediately) on `mmap` systems). + Mimalloc generally does not "free" OS memory but only "purges" OS memory, in other words, it tries to keep virtual + address ranges and decommits within those ranges (to make the underlying physical memory available to other processes). + +Further options for large workloads and services: + - `MIMALLOC_USE_NUMA_NODES=N`: pretend there are at most `N` NUMA nodes. If not set, the actual NUMA nodes are detected at runtime. Setting `N` to 1 may avoid problems in some virtual environments. Also, setting it to a lower number than the actual NUMA nodes is fine and will only cause threads to potentially allocate more memory across actual NUMA nodes (but this can happen in any case as NUMA local allocation is always a best effort but not guaranteed). -- `MIMALLOC_LARGE_OS_PAGES=1`: use large OS pages (2MiB) when available; for some workloads this can significantly - improve performance. Use `MIMALLOC_VERBOSE` to check if the large OS pages are enabled -- usually one needs - to explicitly allow large OS pages (as on [Windows][windows-huge] and [Linux][linux-huge]). However, sometimes +- `MIMALLOC_ALLOW_LARGE_OS_PAGES=1`: use large OS pages (2 or 4MiB) when available; for some workloads this can significantly + improve performance. When this option is disabled, it also disables transparent huge pages (THP) for the process + (on Linux and Android). Use `MIMALLOC_VERBOSE` to check if the large OS pages are enabled -- usually one needs + to explicitly give permissions for large OS pages (as on [Windows][windows-huge] and [Linux][linux-huge]). However, sometimes the OS is very slow to reserve contiguous physical memory for large OS pages so use with care on systems that - can have fragmented memory (for that reason, we generally recommend to use `MIMALLOC_RESERVE_HUGE_OS_PAGES` instead whenever possible). - -- `MIMALLOC_RESERVE_HUGE_OS_PAGES=N`: where N is the number of 1GiB _huge_ OS pages. This reserves the huge pages at + can have fragmented memory (for that reason, we generally recommend to use `MIMALLOC_RESERVE_HUGE_OS_PAGES` instead whenever possible). +- `MIMALLOC_RESERVE_HUGE_OS_PAGES=N`: where `N` is the number of 1GiB _huge_ OS pages. This reserves the huge pages at startup and sometimes this can give a large (latency) performance improvement on big workloads. - Usually it is better to not use - `MIMALLOC_LARGE_OS_PAGES` in combination with this setting. Just like large OS pages, use with care as reserving + Usually it is better to not use `MIMALLOC_ALLOW_LARGE_OS_PAGES=1` in combination with this setting. Just like large + OS pages, use with care as reserving contiguous physical memory can take a long time when memory is fragmented (but reserving the huge pages is done at startup only once). - Note that we usually need to explicitly enable huge OS pages (as on [Windows][windows-huge] and [Linux][linux-huge])). 
+ Note that we usually need to explicitly give permission for huge OS pages (as on [Windows][windows-huge] and [Linux][linux-huge])). With huge OS pages, it may be beneficial to set the setting `MIMALLOC_EAGER_COMMIT_DELAY=N` (`N` is 1 by default) to delay the initial `N` segments (of 4MiB) of a thread to not allocate in the huge OS pages; this prevents threads that are short lived - and allocate just a little to take up space in the huge OS page area (which cannot be reset). + and allocate just a little to take up space in the huge OS page area (which cannot be purged as huge OS pages are pinned + to physical memory). + The huge pages are usually allocated evenly among NUMA nodes. + We can use `MIMALLOC_RESERVE_HUGE_OS_PAGES_AT=N` where `N` is the numa node (starting at 0) to allocate all + the huge pages at a specific numa node instead. Use caution when using `fork` in combination with either large or huge OS pages: on a fork, the OS uses copy-on-write for all pages in the original process including the huge OS pages. When any memory is now written in that area, the -OS will copy the entire 1GiB huge page (or 2MiB large page) which can cause the memory usage to grow in big increments. +OS will copy the entire 1GiB huge page (or 2MiB large page) which can cause the memory usage to grow in large increments. [linux-huge]: https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/5/html/tuning_and_optimizing_red_hat_enterprise_linux_for_oracle_9i_and_10g_databases/sect-oracle_9i_and_10g_tuning_guide-large_memory_optimization_big_pages_and_huge_pages-configuring_huge_pages_in_red_hat_enterprise_linux_4_or_5 [windows-huge]: https://docs.microsoft.com/en-us/sql/database-engine/configure-windows/enable-the-lock-pages-in-memory-option-windows?view=sql-server-2017 @@ -337,15 +375,15 @@ When _mimalloc_ is built using debug mode, various checks are done at runtime to - Corrupted free-lists and some forms of use-after-free are detected. -# Overriding Malloc +# Overriding Standard Malloc -Overriding the standard `malloc` can be done either _dynamically_ or _statically_. +Overriding the standard `malloc` (and `new`) can be done either _dynamically_ or _statically_. ## Dynamic override This is the recommended way to override the standard malloc interface. -### Override on Linux, BSD +### Dynamic Override on Linux, BSD On these ELF-based systems we preload the mimalloc shared library so all calls to the standard `malloc` interface are @@ -364,60 +402,68 @@ or run with the debug version to get detailed statistics: > env MIMALLOC_SHOW_STATS=1 LD_PRELOAD=/usr/lib/libmimalloc-debug.so myprogram ``` -### Override on MacOS +### Dynamic Override on MacOS On macOS we can also preload the mimalloc shared library so all calls to the standard `malloc` interface are resolved to the _mimalloc_ library. ``` -> env DYLD_FORCE_FLAT_NAMESPACE=1 DYLD_INSERT_LIBRARIES=/usr/lib/libmimalloc.dylib myprogram +> env DYLD_INSERT_LIBRARIES=/usr/lib/libmimalloc.dylib myprogram ``` Note that certain security restrictions may apply when doing this from the [shell](https://stackoverflow.com/questions/43941322/dyld-insert-libraries-ignored-when-calling-application-through-bash). -(Note: macOS support for dynamic overriding is recent, please report any issues.) 
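Before turning to Windows, note that when a program is linked against mimalloc directly, the environment options described earlier can also be set programmatically, preferably before the first allocation. A sketch, assuming the option names from the current `mimalloc.h` headers (verify the exact enum values for your version):

```
#include <mimalloc.h>

int main(void) {
  mi_option_enable(mi_option_show_stats);      // same effect as MIMALLOC_SHOW_STATS=1
  mi_option_set(mi_option_purge_delay, 100);   // assumed name for MIMALLOC_PURGE_DELAY in v2.1.x
  void* p = mi_malloc(1 << 20);
  mi_free(p);
  return 0;                                    // statistics are printed when the program terminates
}
```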
-### Override on Windows +### Dynamic Override on Windows -Overriding on Windows is robust and has the -particular advantage to be able to redirect all malloc/free calls that go through +Dynamically overriding on mimalloc on Windows +is robust and has the particular advantage to be able to redirect all malloc/free calls that go through the (dynamic) C runtime allocator, including those from other DLL's or libraries. - -The overriding on Windows requires that you link your program explicitly with -the mimalloc DLL and use the C-runtime library as a DLL (using the `/MD` or `/MDd` switch). -Also, the `mimalloc-redirect.dll` (or `mimalloc-redirect32.dll`) must be available -in the same folder as the main `mimalloc-override.dll` at runtime (as it is a dependency). -The redirection DLL ensures that all calls to the C runtime malloc API get redirected to -mimalloc (in `mimalloc-override.dll`). - -To ensure the mimalloc DLL is loaded at run-time it is easiest to insert some -call to the mimalloc API in the `main` function, like `mi_version()` -(or use the `/INCLUDE:mi_version` switch on the linker). See the `mimalloc-override-test` project -for an example on how to use this. For best performance on Windows with C++, it +As it intercepts all allocation calls on a low level, it can be used reliably +on large programs that include other 3rd party components. +There are four requirements to make the overriding work robustly: + +1. Use the C-runtime library as a DLL (using the `/MD` or `/MDd` switch). +2. Link your program explicitly with `mimalloc-override.dll` library. + To ensure the `mimalloc-override.dll` is loaded at run-time it is easiest to insert some + call to the mimalloc API in the `main` function, like `mi_version()` + (or use the `/INCLUDE:mi_version` switch on the linker). See the `mimalloc-override-test` project + for an example on how to use this. +3. The [`mimalloc-redirect.dll`](bin) (or `mimalloc-redirect32.dll`) must be put + in the same folder as the main `mimalloc-override.dll` at runtime (as it is a dependency of that DLL). + The redirection DLL ensures that all calls to the C runtime malloc API get redirected to + mimalloc functions (which reside in `mimalloc-override.dll`). +4. Ensure the `mimalloc-override.dll` comes as early as possible in the import + list of the final executable (so it can intercept all potential allocations). + +For best performance on Windows with C++, it is also recommended to also override the `new`/`delete` operations (by including -[`mimalloc-new-delete.h`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc-new-delete.h) a single(!) source file in your project). +[`mimalloc-new-delete.h`](include/mimalloc-new-delete.h) +a single(!) source file in your project). The environment variable `MIMALLOC_DISABLE_REDIRECT=1` can be used to disable dynamic overriding at run-time. Use `MIMALLOC_VERBOSE=1` to check if mimalloc was successfully redirected. -(Note: in principle, it is possible to even patch existing executables without any recompilation +We cannot always re-link an executable with `mimalloc-override.dll`, and similarly, we cannot always +ensure the the DLL comes first in the import table of the final executable. 
+In many cases though we can patch existing executables without any recompilation if they are linked with the dynamic C runtime (`ucrtbase.dll`) -- just put the `mimalloc-override.dll` into the import table (and put `mimalloc-redirect.dll` in the same folder) -Such patching can be done for example with [CFF Explorer](https://ntcore.com/?page_id=388)). - +Such patching can be done for example with [CFF Explorer](https://ntcore.com/?page_id=388) or +the [`minject`](bin) program. ## Static override On Unix-like systems, you can also statically link with _mimalloc_ to override the standard malloc interface. The recommended way is to link the final program with the -_mimalloc_ single object file (`mimalloc-override.o`). We use +_mimalloc_ single object file (`mimalloc.o`). We use an object file instead of a library file as linkers give preference to that over archives to resolve symbols. To ensure that the standard malloc interface resolves to the _mimalloc_ library, link it as the first object file. For example: ``` -> gcc -o myprogram mimalloc-override.o myfile1.c ... +> gcc -o myprogram mimalloc.o myfile1.c ... ``` Another way to override statically that works on all platforms, is to @@ -427,6 +473,96 @@ This is provided by [`mimalloc-override.h`](https://github.com/microsoft/mimallo under your control or otherwise mixing of pointers from different heaps may occur! +# Tools + +Generally, we recommend using the standard allocator with memory tracking tools, but mimalloc +can also be build to support the [address sanitizer][asan] or the excellent [Valgrind] tool. +Moreover, it can be build to support Windows event tracing ([ETW]). +This has a small performance overhead but does allow detecting memory leaks and byte-precise +buffer overflows directly on final executables. See also the `test/test-wrong.c` file to test with various tools. + +## Valgrind + +To build with [valgrind] support, use the `MI_TRACK_VALGRIND=ON` cmake option: + +``` +> cmake ../.. -DMI_TRACK_VALGRIND=ON +``` + +This can also be combined with secure mode or debug mode. +You can then run your programs directly under valgrind: + +``` +> valgrind +``` + +If you rely on overriding `malloc`/`free` by mimalloc (instead of using the `mi_malloc`/`mi_free` API directly), +you also need to tell `valgrind` to not intercept those calls itself, and use: + +``` +> MIMALLOC_SHOW_STATS=1 valgrind --soname-synonyms=somalloc=*mimalloc* -- +``` + +By setting the `MIMALLOC_SHOW_STATS` environment variable you can check that mimalloc is indeed +used and not the standard allocator. Even though the [Valgrind option][valgrind-soname] +is called `--soname-synonyms`, this also +works when overriding with a static library or object file. Unfortunately, it is not possible to +dynamically override mimalloc using `LD_PRELOAD` together with `valgrind`. +See also the `test/test-wrong.c` file to test with `valgrind`. + +Valgrind support is in its initial development -- please report any issues. + +[Valgrind]: https://valgrind.org/ +[valgrind-soname]: https://valgrind.org/docs/manual/manual-core.html#opt.soname-synonyms + +## ASAN + +To build with the address sanitizer, use the `-DMI_TRACK_ASAN=ON` cmake option: + +``` +> cmake ../.. -DMI_TRACK_ASAN=ON +``` + +This can also be combined with secure mode or debug mode. +You can then run your programs as:' + +``` +> ASAN_OPTIONS=verbosity=1 +``` + +When you link a program with an address sanitizer build of mimalloc, you should +generally compile that program too with the address sanitizer enabled. 
+For example, assuming you build mimalloc in `out/debug`: + +``` +clang -g -o test-wrong -Iinclude test/test-wrong.c out/debug/libmimalloc-asan-debug.a -lpthread -fsanitize=address -fsanitize-recover=address +``` + +Since the address sanitizer redirects the standard allocation functions, on some platforms (macOSX for example) +it is required to compile mimalloc with `-DMI_OVERRIDE=OFF`. +Adress sanitizer support is in its initial development -- please report any issues. + +[asan]: https://github.com/google/sanitizers/wiki/AddressSanitizer + +## ETW + +Event tracing for Windows ([ETW]) provides a high performance way to capture all allocations though +mimalloc and analyze them later. To build with ETW support, use the `-DMI_TRACK_ETW=ON` cmake option. + +You can then capture an allocation trace using the Windows performance recorder (WPR), using the +`src/prim/windows/etw-mimalloc.wprp` profile. In an admin prompt, you can use: +``` +> wpr -start src\prim\windows\etw-mimalloc.wprp -filemode +> +> wpr -stop .etl +``` +and then open `.etl` in the Windows Performance Analyzer (WPA), or +use a tool like [TraceControl] that is specialized for analyzing mimalloc traces. + +[ETW]: https://learn.microsoft.com/en-us/windows-hardware/test/wpt/event-tracing-for-windows +[TraceControl]: https://github.com/xinglonghe/TraceControl + + # Performance Last update: 2021-01-30 @@ -532,7 +668,7 @@ The _alloc-test_, by [OLogN Technologies AG](http://ithare.com/testing-memory-allocators-ptmalloc2-tcmalloc-hoard-jemalloc-while-trying-to-simulate-real-world-loads/), is a very allocation intensive benchmark doing millions of allocations in various size classes. The test is scaled such that when an allocator performs almost identically on _alloc-test1_ as _alloc-testN_ it -means that it scales linearly. +means that it scales linearly. The _sh6bench_ and _sh8bench_ benchmarks are developed by [MicroQuill](http://www.microquill.com/) as part of SmartHeap. @@ -636,6 +772,7 @@ see the differences in the _larsonN_, _mstressN_, and _xmalloc-testN_ benchmarks --> + # References - \[1] Emery D. Berger, Kathryn S. McKinley, Robert D. Blumofe, and Paul R. Wilson. @@ -673,7 +810,6 @@ see the differences in the _larsonN_, _mstressN_, and _xmalloc-testN_ benchmarks In Proceedings of the 2019 ACM SIGPLAN International Symposium on Memory Management, 122–135. ACM. 2019. --> - # Contributing This project welcomes contributions and suggestions. Most contributions require you to agree to a @@ -683,3 +819,44 @@ the rights to use your contribution. For details, visit https://cla.microsoft.co When you submit a pull request, a CLA-bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repos using our CLA. + + +# Older Release Notes + +* 2021-11-14, `v1.7.3`, `v2.0.3` (beta): improved WASM support, improved macOS support and performance (including + M1), improved performance for v2 for large objects, Python integration improvements, more standard + installation directories, various small fixes. +* 2021-06-17, `v1.7.2`, `v2.0.2` (beta): support M1, better installation layout on Linux, fix + thread_id on Android, prefer 2-6TiB area for aligned allocation to work better on pre-windows 8, various small fixes. 
+* 2021-04-06, `v1.7.1`, `v2.0.1` (beta): fix bug in arena allocation for huge pages, improved aslr on large allocations, initial M1 support (still experimental). +* 2021-01-31, `v2.0.0`: beta release 2.0: new slice algorithm for managing internal mimalloc pages. +* 2021-01-31, `v1.7.0`: stable release 1.7: support explicit user provided memory regions, more precise statistics, + improve macOS overriding, initial support for Apple M1, improved DragonFly support, faster memcpy on Windows, various small fixes. + +* 2020-09-24, `v1.6.7`: stable release 1.6: using standard C atomics, passing tsan testing, improved + handling of failing to commit on Windows, add [`mi_process_info`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc.h#L156) api call. +* 2020-08-06, `v1.6.4`: stable release 1.6: improved error recovery in low-memory situations, + support for IllumOS and Haiku, NUMA support for Vista/XP, improved NUMA detection for AMD Ryzen, ubsan support. +* 2020-05-05, `v1.6.3`: stable release 1.6: improved behavior in out-of-memory situations, improved malloc zones on macOS, + build PIC static libraries by default, add option to abort on out-of-memory, line buffered statistics. +* 2020-04-20, `v1.6.2`: stable release 1.6: fix compilation on Android, MingW, Raspberry, and Conda, + stability fix for Windows 7, fix multiple mimalloc instances in one executable, fix `strnlen` overload, + fix aligned debug padding. +* 2020-02-17, `v1.6.1`: stable release 1.6: minor updates (build with clang-cl, fix alignment issue for small objects). +* 2020-02-09, `v1.6.0`: stable release 1.6: fixed potential memory leak, improved overriding + and thread local support on FreeBSD, NetBSD, DragonFly, and macOSX. New byte-precise + heap block overflow detection in debug mode (besides the double-free detection and free-list + corruption detection). Add `nodiscard` attribute to most allocation functions. + Enable `MIMALLOC_PAGE_RESET` by default. New reclamation strategy for abandoned heap pages + for better memory footprint. +* 2020-02-09, `v1.5.0`: stable release 1.5: improved free performance, small bug fixes. +* 2020-01-22, `v1.4.0`: stable release 1.4: improved performance for delayed OS page reset, +more eager concurrent free, addition of STL allocator, fixed potential memory leak. +* 2020-01-15, `v1.3.0`: stable release 1.3: bug fixes, improved randomness and [stronger +free list encoding](https://github.com/microsoft/mimalloc/blob/783e3377f79ee82af43a0793910a9f2d01ac7863/include/mimalloc-internal.h#L396) in secure mode. + +* 2019-12-22, `v1.2.2`: stable release 1.2: minor updates. +* 2019-11-22, `v1.2.0`: stable release 1.2: bug fixes, improved secure mode (free list corruption checks, double free mitigation). Improved dynamic overriding on Windows. +* 2019-10-07, `v1.1.0`: stable release 1.1. +* 2019-09-01, `v1.0.8`: pre-release 8: more robust windows dynamic overriding, initial huge page support. +* 2019-08-10, `v1.0.6`: pre-release 6: various performance improvements. diff --git a/contrib/libs/mimalloc/src/alloc-aligned.c b/contrib/libs/mimalloc/src/alloc-aligned.c index 724c0a1bfe31..20c360444925 100644 --- a/contrib/libs/mimalloc/src/alloc-aligned.c +++ b/contrib/libs/mimalloc/src/alloc-aligned.c @@ -6,113 +6,222 @@ terms of the MIT license. 
A copy of the license can be found in the file -----------------------------------------------------------------------------*/ #include "mimalloc.h" -#include "mimalloc-internal.h" +#include "mimalloc/internal.h" +#include "mimalloc/prim.h" // mi_prim_get_default_heap -#include // memset +#include // memset // ------------------------------------------------------ // Aligned Allocation // ------------------------------------------------------ -static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept { - // note: we don't require `size > offset`, we just guarantee that - // the address at offset is aligned regardless of the allocated size. - mi_assert(alignment > 0); - if (mi_unlikely(size > PTRDIFF_MAX)) return NULL; // we don't allocate more than PTRDIFF_MAX (see ) - if (mi_unlikely(alignment==0 || !_mi_is_power_of_two(alignment))) return NULL; // require power-of-two (see ) - const uintptr_t align_mask = alignment-1; // for any x, `(x & align_mask) == (x % alignment)` - - // try if there is a small block available with just the right alignment - const size_t padsize = size + MI_PADDING_SIZE; - if (mi_likely(padsize <= MI_SMALL_SIZE_MAX)) { - mi_page_t* page = _mi_heap_get_free_small_page(heap,padsize); - const bool is_aligned = (((uintptr_t)page->free+offset) & align_mask)==0; - if (mi_likely(page->free != NULL && is_aligned)) - { - #if MI_STAT>1 - mi_heap_stat_increase( heap, malloc, size); +static bool mi_malloc_is_naturally_aligned( size_t size, size_t alignment ) { + // objects up to `MI_MAX_ALIGN_GUARANTEE` are allocated aligned to their size (see `segment.c:_mi_segment_page_start`). + mi_assert_internal(_mi_is_power_of_two(alignment) && (alignment > 0)); + if (alignment > size) return false; + if (alignment <= MI_MAX_ALIGN_SIZE) return true; + const size_t bsize = mi_good_size(size); + return (bsize <= MI_MAX_ALIGN_GUARANTEE && (bsize & (alignment-1)) == 0); +} + +// Fallback aligned allocation that over-allocates -- split out for better codegen +static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_overalloc(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept +{ + mi_assert_internal(size <= (MI_MAX_ALLOC_SIZE - MI_PADDING_SIZE)); + mi_assert_internal(alignment != 0 && _mi_is_power_of_two(alignment)); + + void* p; + size_t oversize; + if mi_unlikely(alignment > MI_BLOCK_ALIGNMENT_MAX) { + // use OS allocation for very large alignment and allocate inside a huge page (dedicated segment with 1 page) + // This can support alignments >= MI_SEGMENT_SIZE by ensuring the object can be aligned at a point in the + // first (and single) page such that the segment info is `MI_SEGMENT_SIZE` bytes before it (so it can be found by aligning the pointer down) + if mi_unlikely(offset != 0) { + // todo: cannot support offset alignment for very large alignments yet + #if MI_DEBUG > 0 + _mi_error_message(EOVERFLOW, "aligned allocation with a very large alignment cannot be used with an alignment offset (size %zu, alignment %zu, offset %zu)\n", size, alignment, offset); #endif - void* p = _mi_page_malloc(heap,page,padsize); // TODO: inline _mi_page_malloc - mi_assert_internal(p != NULL); - mi_assert_internal(((uintptr_t)p + offset) % alignment == 0); - if (zero) _mi_block_zero_init(page,p,size); - return p; + return NULL; + } + oversize = (size <= MI_SMALL_SIZE_MAX ? 
MI_SMALL_SIZE_MAX + 1 /* ensure we use generic malloc path */ : size); + p = _mi_heap_malloc_zero_ex(heap, oversize, false, alignment); // the page block size should be large enough to align in the single huge page block + // zero afterwards as only the area from the aligned_p may be committed! + if (p == NULL) return NULL; + } + else { + // otherwise over-allocate + oversize = size + alignment - 1; + p = _mi_heap_malloc_zero(heap, oversize, zero); + if (p == NULL) return NULL; + } + + // .. and align within the allocation + const uintptr_t align_mask = alignment - 1; // for any x, `(x & align_mask) == (x % alignment)` + const uintptr_t poffset = ((uintptr_t)p + offset) & align_mask; + const uintptr_t adjust = (poffset == 0 ? 0 : alignment - poffset); + mi_assert_internal(adjust < alignment); + void* aligned_p = (void*)((uintptr_t)p + adjust); + if (aligned_p != p) { + mi_page_t* page = _mi_ptr_page(p); + mi_page_set_has_aligned(page, true); + _mi_padding_shrink(page, (mi_block_t*)p, adjust + size); + } + // todo: expand padding if overallocated ? + + mi_assert_internal(mi_page_usable_block_size(_mi_ptr_page(p)) >= adjust + size); + mi_assert_internal(p == _mi_page_ptr_unalign(_mi_ptr_page(aligned_p), aligned_p)); + mi_assert_internal(((uintptr_t)aligned_p + offset) % alignment == 0); + mi_assert_internal(mi_usable_size(aligned_p)>=size); + mi_assert_internal(mi_usable_size(p) == mi_usable_size(aligned_p)+adjust); + + // now zero the block if needed + if (alignment > MI_BLOCK_ALIGNMENT_MAX) { + // for the tracker, on huge aligned allocations only from the start of the large block is defined + mi_track_mem_undefined(aligned_p, size); + if (zero) { + _mi_memzero_aligned(aligned_p, mi_usable_size(aligned_p)); } } - // use regular allocation if it is guaranteed to fit the alignment constraints - if (offset==0 && alignment<=padsize && padsize<=MI_MEDIUM_OBJ_SIZE_MAX && (padsize&align_mask)==0) { + if (p != aligned_p) { + mi_track_align(p,aligned_p,adjust,mi_usable_size(aligned_p)); + } + return aligned_p; +} + +// Generic primitive aligned allocation -- split out for better codegen +static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_generic(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept +{ + mi_assert_internal(alignment != 0 && _mi_is_power_of_two(alignment)); + // we don't allocate more than MI_MAX_ALLOC_SIZE (see ) + if mi_unlikely(size > (MI_MAX_ALLOC_SIZE - MI_PADDING_SIZE)) { + #if MI_DEBUG > 0 + _mi_error_message(EOVERFLOW, "aligned allocation request is too large (size %zu, alignment %zu)\n", size, alignment); + #endif + return NULL; + } + + // use regular allocation if it is guaranteed to fit the alignment constraints. + // this is important to try as the fast path in `mi_heap_malloc_zero_aligned` only works when there exist + // a page with the right block size, and if we always use the over-alloc fallback that would never happen. + if (offset == 0 && mi_malloc_is_naturally_aligned(size,alignment)) { void* p = _mi_heap_malloc_zero(heap, size, zero); mi_assert_internal(p == NULL || ((uintptr_t)p % alignment) == 0); - return p; + const bool is_aligned_or_null = (((uintptr_t)p) & (alignment-1))==0; + if mi_likely(is_aligned_or_null) { + return p; + } + else { + // this should never happen if the `mi_malloc_is_naturally_aligned` check is correct.. 
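+      // free the misaligned block and fall through to the over-allocating fallback below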
+ mi_assert(false); + mi_free(p); + } + } + + // fall back to over-allocation + return mi_heap_malloc_zero_aligned_at_overalloc(heap,size,alignment,offset,zero); +} + +// Primitive aligned allocation +static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept +{ + // note: we don't require `size > offset`, we just guarantee that the address at offset is aligned regardless of the allocated size. + if mi_unlikely(alignment == 0 || !_mi_is_power_of_two(alignment)) { // require power-of-two (see ) + #if MI_DEBUG > 0 + _mi_error_message(EOVERFLOW, "aligned allocation requires the alignment to be a power-of-two (size %zu, alignment %zu)\n", size, alignment); + #endif + return NULL; } - // otherwise over-allocate - void* p = _mi_heap_malloc_zero(heap, size + alignment - 1, zero); - if (p == NULL) return NULL; + // try first if there happens to be a small block available with just the right alignment + if mi_likely(size <= MI_SMALL_SIZE_MAX && alignment <= size) { + const uintptr_t align_mask = alignment-1; // for any x, `(x & align_mask) == (x % alignment)` + const size_t padsize = size + MI_PADDING_SIZE; + mi_page_t* page = _mi_heap_get_free_small_page(heap, padsize); + if mi_likely(page->free != NULL) { + const bool is_aligned = (((uintptr_t)page->free + offset) & align_mask)==0; + if mi_likely(is_aligned) + { + #if MI_STAT>1 + mi_heap_stat_increase(heap, malloc, size); + #endif + void* p = (zero ? _mi_page_malloc_zeroed(heap,page,padsize) : _mi_page_malloc(heap,page,padsize)); // call specific page malloc for better codegen + mi_assert_internal(p != NULL); + mi_assert_internal(((uintptr_t)p + offset) % alignment == 0); + mi_track_malloc(p,size,zero); + return p; + } + } + } - // .. and align within the allocation - uintptr_t adjust = alignment - (((uintptr_t)p + offset) & align_mask); - mi_assert_internal(adjust <= alignment); - void* aligned_p = (adjust == alignment ? 
p : (void*)((uintptr_t)p + adjust)); - if (aligned_p != p) mi_page_set_has_aligned(_mi_ptr_page(p), true); - mi_assert_internal(((uintptr_t)aligned_p + offset) % alignment == 0); - mi_assert_internal( p == _mi_page_ptr_unalign(_mi_ptr_segment(aligned_p),_mi_ptr_page(aligned_p),aligned_p) ); - return aligned_p; + // fallback to generic aligned allocation + return mi_heap_malloc_zero_aligned_at_generic(heap, size, alignment, offset, zero); } -mi_decl_restrict void* mi_heap_malloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { +// ------------------------------------------------------ +// Optimized mi_heap_malloc_aligned / mi_malloc_aligned +// ------------------------------------------------------ + +mi_decl_nodiscard mi_decl_restrict void* mi_heap_malloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { return mi_heap_malloc_zero_aligned_at(heap, size, alignment, offset, false); } -mi_decl_restrict void* mi_heap_malloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept { +mi_decl_nodiscard mi_decl_restrict void* mi_heap_malloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept { return mi_heap_malloc_aligned_at(heap, size, alignment, 0); } -mi_decl_restrict void* mi_heap_zalloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { +// ------------------------------------------------------ +// Aligned Allocation +// ------------------------------------------------------ + +mi_decl_nodiscard mi_decl_restrict void* mi_heap_zalloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { return mi_heap_malloc_zero_aligned_at(heap, size, alignment, offset, true); } -mi_decl_restrict void* mi_heap_zalloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept { +mi_decl_nodiscard mi_decl_restrict void* mi_heap_zalloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept { return mi_heap_zalloc_aligned_at(heap, size, alignment, 0); } -mi_decl_restrict void* mi_heap_calloc_aligned_at(mi_heap_t* heap, size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { +mi_decl_nodiscard mi_decl_restrict void* mi_heap_calloc_aligned_at(mi_heap_t* heap, size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { size_t total; if (mi_count_size_overflow(count, size, &total)) return NULL; return mi_heap_zalloc_aligned_at(heap, total, alignment, offset); } -mi_decl_restrict void* mi_heap_calloc_aligned(mi_heap_t* heap, size_t count, size_t size, size_t alignment) mi_attr_noexcept { +mi_decl_nodiscard mi_decl_restrict void* mi_heap_calloc_aligned(mi_heap_t* heap, size_t count, size_t size, size_t alignment) mi_attr_noexcept { return mi_heap_calloc_aligned_at(heap,count,size,alignment,0); } -mi_decl_restrict void* mi_malloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept { - return mi_heap_malloc_aligned_at(mi_get_default_heap(), size, alignment, offset); +mi_decl_nodiscard mi_decl_restrict void* mi_malloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept { + return mi_heap_malloc_aligned_at(mi_prim_get_default_heap(), size, alignment, offset); } -mi_decl_restrict void* mi_malloc_aligned(size_t size, size_t alignment) mi_attr_noexcept { - return mi_heap_malloc_aligned(mi_get_default_heap(), size, alignment); +mi_decl_nodiscard mi_decl_restrict void* mi_malloc_aligned(size_t size, size_t 
alignment) mi_attr_noexcept { + return mi_heap_malloc_aligned(mi_prim_get_default_heap(), size, alignment); } -mi_decl_restrict void* mi_zalloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept { - return mi_heap_zalloc_aligned_at(mi_get_default_heap(), size, alignment, offset); +mi_decl_nodiscard mi_decl_restrict void* mi_zalloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept { + return mi_heap_zalloc_aligned_at(mi_prim_get_default_heap(), size, alignment, offset); } -mi_decl_restrict void* mi_zalloc_aligned(size_t size, size_t alignment) mi_attr_noexcept { - return mi_heap_zalloc_aligned(mi_get_default_heap(), size, alignment); +mi_decl_nodiscard mi_decl_restrict void* mi_zalloc_aligned(size_t size, size_t alignment) mi_attr_noexcept { + return mi_heap_zalloc_aligned(mi_prim_get_default_heap(), size, alignment); } -mi_decl_restrict void* mi_calloc_aligned_at(size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { - return mi_heap_calloc_aligned_at(mi_get_default_heap(), count, size, alignment, offset); +mi_decl_nodiscard mi_decl_restrict void* mi_calloc_aligned_at(size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { + return mi_heap_calloc_aligned_at(mi_prim_get_default_heap(), count, size, alignment, offset); } -mi_decl_restrict void* mi_calloc_aligned(size_t count, size_t size, size_t alignment) mi_attr_noexcept { - return mi_heap_calloc_aligned(mi_get_default_heap(), count, size, alignment); +mi_decl_nodiscard mi_decl_restrict void* mi_calloc_aligned(size_t count, size_t size, size_t alignment) mi_attr_noexcept { + return mi_heap_calloc_aligned(mi_prim_get_default_heap(), count, size, alignment); } +// ------------------------------------------------------ +// Aligned re-allocation +// ------------------------------------------------------ + static void* mi_heap_realloc_zero_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset, bool zero) mi_attr_noexcept { mi_assert(alignment > 0); if (alignment <= sizeof(uintptr_t)) return _mi_heap_realloc_zero(heap,p,newsize,zero); @@ -123,19 +232,13 @@ static void* mi_heap_realloc_zero_aligned_at(mi_heap_t* heap, void* p, size_t ne return p; // reallocation still fits, is aligned and not more than 50% waste } else { + // note: we don't zero allocate upfront so we only zero initialize the expanded part void* newp = mi_heap_malloc_aligned_at(heap,newsize,alignment,offset); if (newp != NULL) { if (zero && newsize > size) { - const mi_page_t* page = _mi_ptr_page(newp); - if (page->is_zero) { - // already zero initialized - mi_assert_expensive(mi_mem_is_zero(newp,newsize)); - } - else { - // also set last word in the previous allocation to zero to ensure any padding is zero-initialized - size_t start = (size >= sizeof(intptr_t) ? size - sizeof(intptr_t) : 0); - memset((uint8_t*)newp + start, 0, newsize - start); - } + // also set last word in the previous allocation to zero to ensure any padding is zero-initialized + size_t start = (size >= sizeof(intptr_t) ? size - sizeof(intptr_t) : 0); + _mi_memzero((uint8_t*)newp + start, newsize - start); } _mi_memcpy_aligned(newp, p, (newsize > size ? 
size : newsize)); mi_free(p); // only free if successful @@ -151,55 +254,54 @@ static void* mi_heap_realloc_zero_aligned(mi_heap_t* heap, void* p, size_t newsi return mi_heap_realloc_zero_aligned_at(heap,p,newsize,alignment,offset,zero); } -void* mi_heap_realloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept { +mi_decl_nodiscard void* mi_heap_realloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept { return mi_heap_realloc_zero_aligned_at(heap,p,newsize,alignment,offset,false); } -void* mi_heap_realloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept { +mi_decl_nodiscard void* mi_heap_realloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept { return mi_heap_realloc_zero_aligned(heap,p,newsize,alignment,false); } -void* mi_heap_rezalloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept { +mi_decl_nodiscard void* mi_heap_rezalloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept { return mi_heap_realloc_zero_aligned_at(heap, p, newsize, alignment, offset, true); } -void* mi_heap_rezalloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept { +mi_decl_nodiscard void* mi_heap_rezalloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept { return mi_heap_realloc_zero_aligned(heap, p, newsize, alignment, true); } -void* mi_heap_recalloc_aligned_at(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { +mi_decl_nodiscard void* mi_heap_recalloc_aligned_at(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { size_t total; if (mi_count_size_overflow(newcount, size, &total)) return NULL; return mi_heap_rezalloc_aligned_at(heap, p, total, alignment, offset); } -void* mi_heap_recalloc_aligned(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept { +mi_decl_nodiscard void* mi_heap_recalloc_aligned(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept { size_t total; if (mi_count_size_overflow(newcount, size, &total)) return NULL; return mi_heap_rezalloc_aligned(heap, p, total, alignment); } -void* mi_realloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept { - return mi_heap_realloc_aligned_at(mi_get_default_heap(), p, newsize, alignment, offset); +mi_decl_nodiscard void* mi_realloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept { + return mi_heap_realloc_aligned_at(mi_prim_get_default_heap(), p, newsize, alignment, offset); } -void* mi_realloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept { - return mi_heap_realloc_aligned(mi_get_default_heap(), p, newsize, alignment); +mi_decl_nodiscard void* mi_realloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept { + return mi_heap_realloc_aligned(mi_prim_get_default_heap(), p, newsize, alignment); } -void* mi_rezalloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept { - return mi_heap_rezalloc_aligned_at(mi_get_default_heap(), p, newsize, alignment, offset); +mi_decl_nodiscard void* mi_rezalloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept { + 
return mi_heap_rezalloc_aligned_at(mi_prim_get_default_heap(), p, newsize, alignment, offset); } -void* mi_rezalloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept { - return mi_heap_rezalloc_aligned(mi_get_default_heap(), p, newsize, alignment); +mi_decl_nodiscard void* mi_rezalloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept { + return mi_heap_rezalloc_aligned(mi_prim_get_default_heap(), p, newsize, alignment); } -void* mi_recalloc_aligned_at(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { - return mi_heap_recalloc_aligned_at(mi_get_default_heap(), p, newcount, size, alignment, offset); +mi_decl_nodiscard void* mi_recalloc_aligned_at(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { + return mi_heap_recalloc_aligned_at(mi_prim_get_default_heap(), p, newcount, size, alignment, offset); } -void* mi_recalloc_aligned(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept { - return mi_heap_recalloc_aligned(mi_get_default_heap(), p, newcount, size, alignment); +mi_decl_nodiscard void* mi_recalloc_aligned(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept { + return mi_heap_recalloc_aligned(mi_prim_get_default_heap(), p, newcount, size, alignment); } - diff --git a/contrib/libs/mimalloc/src/alloc-override-osx.c b/contrib/libs/mimalloc/src/alloc-override-osx.c deleted file mode 100644 index f506d30a9565..000000000000 --- a/contrib/libs/mimalloc/src/alloc-override-osx.c +++ /dev/null @@ -1,281 +0,0 @@ -/* ---------------------------------------------------------------------------- -Copyright (c) 2018-2020, Microsoft Research, Daan Leijen -This is free software; you can redistribute it and/or modify it under the -terms of the MIT license. A copy of the license can be found in the file -"LICENSE" at the root of this distribution. ------------------------------------------------------------------------------*/ - -#include "mimalloc.h" -#include "mimalloc-internal.h" - -#if defined(MI_MALLOC_OVERRIDE) - -#if !defined(__APPLE__) -#error "this file should only be included on macOS" -#endif - -/* ------------------------------------------------------ - Override system malloc on macOS - This is done through the malloc zone interface. - It seems we also need to interpose (see `alloc-override.c`) - or otherwise we get zone errors as there are usually - already allocations done by the time we take over the - zone. Unfortunately, that means we need to replace - the `free` with a checked free (`cfree`) impacting - performance. 
------------------------------------------------------- */ - -#include -#include -#include // memset - -#if defined(MAC_OS_X_VERSION_10_6) && \ - MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6 -// only available from OSX 10.6 -extern malloc_zone_t* malloc_default_purgeable_zone(void) __attribute__((weak_import)); -#endif - -/* ------------------------------------------------------ - malloc zone members ------------------------------------------------------- */ - -static size_t zone_size(malloc_zone_t* zone, const void* p) { - UNUSED(zone); - if (!mi_is_in_heap_region(p)) - return 0; // not our pointer, bail out - - return mi_usable_size(p); -} - -static void* zone_malloc(malloc_zone_t* zone, size_t size) { - UNUSED(zone); - return mi_malloc(size); -} - -static void* zone_calloc(malloc_zone_t* zone, size_t count, size_t size) { - UNUSED(zone); - return mi_calloc(count, size); -} - -static void* zone_valloc(malloc_zone_t* zone, size_t size) { - UNUSED(zone); - return mi_malloc_aligned(size, _mi_os_page_size()); -} - -static void zone_free(malloc_zone_t* zone, void* p) { - UNUSED(zone); - mi_free(p); -} - -static void* zone_realloc(malloc_zone_t* zone, void* p, size_t newsize) { - UNUSED(zone); - return mi_realloc(p, newsize); -} - -static void* zone_memalign(malloc_zone_t* zone, size_t alignment, size_t size) { - UNUSED(zone); - return mi_malloc_aligned(size,alignment); -} - -static void zone_destroy(malloc_zone_t* zone) { - UNUSED(zone); - // todo: ignore for now? -} - -static unsigned zone_batch_malloc(malloc_zone_t* zone, size_t size, void** ps, unsigned count) { - size_t i; - for (i = 0; i < count; i++) { - ps[i] = zone_malloc(zone, size); - if (ps[i] == NULL) break; - } - return i; -} - -static void zone_batch_free(malloc_zone_t* zone, void** ps, unsigned count) { - for(size_t i = 0; i < count; i++) { - zone_free(zone, ps[i]); - ps[i] = NULL; - } -} - -static size_t zone_pressure_relief(malloc_zone_t* zone, size_t size) { - UNUSED(zone); UNUSED(size); - mi_collect(false); - return 0; -} - -static void zone_free_definite_size(malloc_zone_t* zone, void* p, size_t size) { - UNUSED(size); - zone_free(zone,p); -} - - -/* ------------------------------------------------------ - Introspection members ------------------------------------------------------- */ - -static kern_return_t intro_enumerator(task_t task, void* p, - unsigned type_mask, vm_address_t zone_address, - memory_reader_t reader, - vm_range_recorder_t recorder) -{ - // todo: enumerate all memory - UNUSED(task); UNUSED(p); UNUSED(type_mask); UNUSED(zone_address); - UNUSED(reader); UNUSED(recorder); - return KERN_SUCCESS; -} - -static size_t intro_good_size(malloc_zone_t* zone, size_t size) { - UNUSED(zone); - return mi_good_size(size); -} - -static boolean_t intro_check(malloc_zone_t* zone) { - UNUSED(zone); - return true; -} - -static void intro_print(malloc_zone_t* zone, boolean_t verbose) { - UNUSED(zone); UNUSED(verbose); - mi_stats_print(NULL); -} - -static void intro_log(malloc_zone_t* zone, void* p) { - UNUSED(zone); UNUSED(p); - // todo? -} - -static void intro_force_lock(malloc_zone_t* zone) { - UNUSED(zone); - // todo? -} - -static void intro_force_unlock(malloc_zone_t* zone) { - UNUSED(zone); - // todo? -} - -static void intro_statistics(malloc_zone_t* zone, malloc_statistics_t* stats) { - UNUSED(zone); - // todo... 
- stats->blocks_in_use = 0; - stats->size_in_use = 0; - stats->max_size_in_use = 0; - stats->size_allocated = 0; -} - -static boolean_t intro_zone_locked(malloc_zone_t* zone) { - UNUSED(zone); - return false; -} - - -/* ------------------------------------------------------ - At process start, override the default allocator ------------------------------------------------------- */ - -static malloc_zone_t* mi_get_default_zone() -{ - // The first returned zone is the real default - malloc_zone_t** zones = NULL; - unsigned count = 0; - kern_return_t ret = malloc_get_all_zones(0, NULL, (vm_address_t**)&zones, &count); - if (ret == KERN_SUCCESS && count > 0) { - return zones[0]; - } - else { - // fallback - return malloc_default_zone(); - } -} - -static malloc_introspection_t mi_introspect = { - .enumerator = &intro_enumerator, - .good_size = &intro_good_size, - .check = &intro_check, - .print = &intro_print, - .log = &intro_log, - .force_lock = &intro_force_lock, - .force_unlock = &intro_force_unlock, -#if defined(MAC_OS_X_VERSION_10_6) && \ - MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6 - .zone_locked = &intro_zone_locked, - .statistics = &intro_statistics, -#endif -}; - -static malloc_zone_t mi_malloc_zone = { - .size = &zone_size, - .zone_name = "mimalloc", - .introspect = &mi_introspect, - .malloc = &zone_malloc, - .calloc = &zone_calloc, - .valloc = &zone_valloc, - .free = &zone_free, - .realloc = &zone_realloc, - .destroy = &zone_destroy, - .batch_malloc = &zone_batch_malloc, - .batch_free = &zone_batch_free, -#if defined(MAC_OS_X_VERSION_10_6) && \ - MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6 - // switch to version 9 on OSX 10.6 to support memalign. - .version = 9, - .memalign = &zone_memalign, - .free_definite_size = &zone_free_definite_size, - .pressure_relief = &zone_pressure_relief, -#else - .version = 4, -#endif -}; - - -#if defined(MI_SHARED_LIB_EXPORT) && defined(MI_INTERPOSE) - -static malloc_zone_t *mi_malloc_default_zone(void) { - return &mi_malloc_zone; -} -// TODO: should use the macros in alloc-override but they aren't available here. -__attribute__((used)) static struct { - const void *replacement; - const void *target; -} replace_malloc_default_zone[] __attribute__((section("__DATA, __interpose"))) = { - { (const void*)mi_malloc_default_zone, (const void*)malloc_default_zone }, -}; -#endif - -static void __attribute__((constructor(0))) _mi_macos_override_malloc() { - malloc_zone_t* purgeable_zone = NULL; - -#if defined(MAC_OS_X_VERSION_10_6) && \ - MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6 - // force the purgeable zone to exist to avoid strange bugs - if (malloc_default_purgeable_zone) { - purgeable_zone = malloc_default_purgeable_zone(); - } -#endif - - // Register our zone. - // thomcc: I think this is still needed to put us in the zone list. - malloc_zone_register(&mi_malloc_zone); - // Unregister the default zone, this makes our zone the new default - // as that was the last registered. - malloc_zone_t *default_zone = mi_get_default_zone(); - // thomcc: Unsure if the next test is *always* false or just false in the - // cases I've tried. I'm also unsure if the code inside is needed. at all - if (default_zone != &mi_malloc_zone) { - malloc_zone_unregister(default_zone); - - // Reregister the default zone so free and realloc in that zone keep working. - malloc_zone_register(default_zone); - } - - // Unregister, and re-register the purgeable_zone to avoid bugs if it occurs - // earlier than the default zone. 
- if (purgeable_zone != NULL) { - malloc_zone_unregister(purgeable_zone); - malloc_zone_register(purgeable_zone); - } - -} - -#endif // MI_MALLOC_OVERRIDE diff --git a/contrib/libs/mimalloc/src/alloc-override.c b/contrib/libs/mimalloc/src/alloc-override.c index 6a87e7bd2d90..12837cdd9455 100644 --- a/contrib/libs/mimalloc/src/alloc-override.c +++ b/contrib/libs/mimalloc/src/alloc-override.c @@ -13,15 +13,26 @@ terms of the MIT license. A copy of the license can be found in the file #error "It is only possible to override "malloc" on Windows when building as a DLL (and linking the C runtime as a DLL)" #endif -#if defined(MI_MALLOC_OVERRIDE) && !(defined(_WIN32)) // || (defined(__APPLE__) && !defined(MI_INTERPOSE))) +#if defined(MI_MALLOC_OVERRIDE) && !(defined(_WIN32)) + +#if defined(__APPLE__) +#include +mi_decl_externc void vfree(void* p); +mi_decl_externc size_t malloc_size(const void* p); +mi_decl_externc size_t malloc_good_size(size_t size); +#endif + +// helper definition for C override of C++ new +typedef void* mi_nothrow_t; // ------------------------------------------------------ // Override system malloc // ------------------------------------------------------ -#if (defined(__GNUC__) || defined(__clang__)) && !defined(__APPLE__) - // use aliasing to alias the exported function to one of our `mi_` functions +#if (defined(__GNUC__) || defined(__clang__)) && !defined(__APPLE__) && !MI_TRACK_ENABLED + // gcc, clang: use aliasing to alias the exported function to one of our `mi_` functions #if (defined(__GNUC__) && __GNUC__ >= 9) + #pragma GCC diagnostic ignored "-Wattributes" // or we get warnings that nodiscard is ignored on a forward #define MI_FORWARD(fun) __attribute__((alias(#fun), used, visibility("default"), copy(fun))); #else #define MI_FORWARD(fun) __attribute__((alias(#fun), used, visibility("default"))); @@ -32,7 +43,7 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_FORWARD0(fun,x) MI_FORWARD(fun) #define MI_FORWARD02(fun,x,y) MI_FORWARD(fun) #else - // use forwarding by calling our `mi_` function + // otherwise use forwarding by calling our `mi_` function #define MI_FORWARD1(fun,x) { return fun(x); } #define MI_FORWARD2(fun,x,y) { return fun(x,y); } #define MI_FORWARD3(fun,x,y,z) { return fun(x,y,z); } @@ -40,7 +51,17 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_FORWARD02(fun,x,y) { fun(x,y); } #endif -#if defined(__APPLE__) && defined(MI_SHARED_LIB_EXPORT) && defined(MI_INTERPOSE) + +#if defined(__APPLE__) && defined(MI_SHARED_LIB_EXPORT) && defined(MI_OSX_INTERPOSE) + // define MI_OSX_IS_INTERPOSED as we should not provide forwarding definitions for + // functions that are interposed (or the interposing does not work) + #define MI_OSX_IS_INTERPOSED + + mi_decl_externc size_t mi_malloc_size_checked(void *p) { + if (!mi_is_in_heap_region(p)) return 0; + return mi_usable_size(p); + } + // use interposing so `DYLD_INSERT_LIBRARIES` works without `DYLD_FORCE_FLAT_NAMESPACE=1` // See: struct mi_interpose_s { @@ -49,36 +70,79 @@ terms of the MIT license. 
A copy of the license can be found in the file }; #define MI_INTERPOSE_FUN(oldfun,newfun) { (const void*)&newfun, (const void*)&oldfun } #define MI_INTERPOSE_MI(fun) MI_INTERPOSE_FUN(fun,mi_##fun) + __attribute__((used)) static struct mi_interpose_s _mi_interposes[] __attribute__((section("__DATA, __interpose"))) = { MI_INTERPOSE_MI(malloc), MI_INTERPOSE_MI(calloc), MI_INTERPOSE_MI(realloc), MI_INTERPOSE_MI(strdup), + #if defined(MAC_OS_X_VERSION_10_7) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_7 MI_INTERPOSE_MI(strndup), + #endif MI_INTERPOSE_MI(realpath), MI_INTERPOSE_MI(posix_memalign), MI_INTERPOSE_MI(reallocf), MI_INTERPOSE_MI(valloc), - #ifndef MI_OSX_ZONE - // some code allocates from default zone but deallocates using plain free :-( (like NxHashResizeToCapacity ) - MI_INTERPOSE_FUN(free,mi_cfree), // use safe free that checks if pointers are from us - #else - // We interpose malloc_default_zone in alloc-override-osx.c - MI_INTERPOSE_MI(free), + MI_INTERPOSE_FUN(malloc_size,mi_malloc_size_checked), + MI_INTERPOSE_MI(malloc_good_size), + #if defined(MAC_OS_X_VERSION_10_15) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_15 + MI_INTERPOSE_MI(aligned_alloc), #endif - // some code allocates from a zone but deallocates using plain free :-( (like NxHashResizeToCapacity ) + #ifdef MI_OSX_ZONE + // we interpose malloc_default_zone in alloc-override-osx.c so we can use mi_free safely + MI_INTERPOSE_MI(free), + MI_INTERPOSE_FUN(vfree,mi_free), + #else + // sometimes code allocates from default zone but deallocates using plain free :-( (like NxHashResizeToCapacity ) MI_INTERPOSE_FUN(free,mi_cfree), // use safe free that checks if pointers are from us + MI_INTERPOSE_FUN(vfree,mi_cfree), + #endif }; + + #ifdef __cplusplus + extern "C" { + #endif + void _ZdlPv(void* p); // delete + void _ZdaPv(void* p); // delete[] + void _ZdlPvm(void* p, size_t n); // delete + void _ZdaPvm(void* p, size_t n); // delete[] + void* _Znwm(size_t n); // new + void* _Znam(size_t n); // new[] + void* _ZnwmRKSt9nothrow_t(size_t n, mi_nothrow_t tag); // new nothrow + void* _ZnamRKSt9nothrow_t(size_t n, mi_nothrow_t tag); // new[] nothrow + #ifdef __cplusplus + } + #endif + __attribute__((used)) static struct mi_interpose_s _mi_cxx_interposes[] __attribute__((section("__DATA, __interpose"))) = + { + MI_INTERPOSE_FUN(_ZdlPv,mi_free), + MI_INTERPOSE_FUN(_ZdaPv,mi_free), + MI_INTERPOSE_FUN(_ZdlPvm,mi_free_size), + MI_INTERPOSE_FUN(_ZdaPvm,mi_free_size), + MI_INTERPOSE_FUN(_Znwm,mi_new), + MI_INTERPOSE_FUN(_Znam,mi_new), + MI_INTERPOSE_FUN(_ZnwmRKSt9nothrow_t,mi_new_nothrow), + MI_INTERPOSE_FUN(_ZnamRKSt9nothrow_t,mi_new_nothrow), + }; + #elif defined(_MSC_VER) // cannot override malloc unless using a dll. // we just override new/delete which does work in a static library. 
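   // (the C++ `operator new`/`operator delete` overrides defined further below in this file cover that case)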
#else - // On all other systems forward to our API - void* malloc(size_t size) MI_FORWARD1(mi_malloc, size) - void* calloc(size_t size, size_t n) MI_FORWARD2(mi_calloc, size, n) - void* realloc(void* p, size_t newsize) MI_FORWARD2(mi_realloc, p, newsize) - void free(void* p) MI_FORWARD0(mi_free, p) + // On all other systems forward allocation primitives to our API + mi_decl_export void* malloc(size_t size) MI_FORWARD1(mi_malloc, size) + mi_decl_export void* calloc(size_t size, size_t n) MI_FORWARD2(mi_calloc, size, n) + mi_decl_export void* realloc(void* p, size_t newsize) MI_FORWARD2(mi_realloc, p, newsize) + mi_decl_export void free(void* p) MI_FORWARD0(mi_free, p) + // In principle we do not need to forward `strdup`/`strndup` but on some systems these do not use `malloc` internally (but a more primitive call) + // We only override if `strdup` is not a macro (as on some older libc's, see issue #885) + #if !defined(strdup) + mi_decl_export char* strdup(const char* str) MI_FORWARD1(mi_strdup, str) + #endif + #if !defined(strndup) && (!defined(__APPLE__) || (defined(MAC_OS_X_VERSION_10_7) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_7)) + mi_decl_export char* strndup(const char* str, size_t n) MI_FORWARD2(mi_strndup, str, n) + #endif #endif #if (defined(__GNUC__) || defined(__clang__)) && !defined(__APPLE__) @@ -96,18 +160,21 @@ terms of the MIT license. A copy of the license can be found in the file // see // ------------------------------------------------------ #include - void operator delete(void* p) noexcept MI_FORWARD0(mi_free,p) - void operator delete[](void* p) noexcept MI_FORWARD0(mi_free,p) - void* operator new(std::size_t n) noexcept(false) MI_FORWARD1(mi_new,n) - void* operator new[](std::size_t n) noexcept(false) MI_FORWARD1(mi_new,n) + #ifndef MI_OSX_IS_INTERPOSED + void operator delete(void* p) noexcept MI_FORWARD0(mi_free,p) + void operator delete[](void* p) noexcept MI_FORWARD0(mi_free,p) + + void* operator new(std::size_t n) noexcept(false) MI_FORWARD1(mi_new,n) + void* operator new[](std::size_t n) noexcept(false) MI_FORWARD1(mi_new,n) - void* operator new (std::size_t n, const std::nothrow_t& tag) noexcept { UNUSED(tag); return mi_new_nothrow(n); } - void* operator new[](std::size_t n, const std::nothrow_t& tag) noexcept { UNUSED(tag); return mi_new_nothrow(n); } + void* operator new (std::size_t n, const std::nothrow_t& tag) noexcept { MI_UNUSED(tag); return mi_new_nothrow(n); } + void* operator new[](std::size_t n, const std::nothrow_t& tag) noexcept { MI_UNUSED(tag); return mi_new_nothrow(n); } - #if (__cplusplus >= 201402L || _MSC_VER >= 1916) - void operator delete (void* p, std::size_t n) noexcept MI_FORWARD02(mi_free_size,p,n) - void operator delete[](void* p, std::size_t n) noexcept MI_FORWARD02(mi_free_size,p,n) + #if (__cplusplus >= 201402L || _MSC_VER >= 1916) + void operator delete (void* p, std::size_t n) noexcept MI_FORWARD02(mi_free_size,p,n) + void operator delete[](void* p, std::size_t n) noexcept MI_FORWARD02(mi_free_size,p,n) + #endif #endif #if (__cplusplus > 201402L && defined(__cpp_aligned_new)) && (!defined(__GNUC__) || (__GNUC__ > 5)) @@ -115,6 +182,8 @@ terms of the MIT license. 
A copy of the license can be found in the file void operator delete[](void* p, std::align_val_t al) noexcept { mi_free_aligned(p, static_cast(al)); } void operator delete (void* p, std::size_t n, std::align_val_t al) noexcept { mi_free_size_aligned(p, n, static_cast(al)); }; void operator delete[](void* p, std::size_t n, std::align_val_t al) noexcept { mi_free_size_aligned(p, n, static_cast(al)); }; + void operator delete (void* p, std::align_val_t al, const std::nothrow_t&) noexcept { mi_free_aligned(p, static_cast(al)); } + void operator delete[](void* p, std::align_val_t al, const std::nothrow_t&) noexcept { mi_free_aligned(p, static_cast(al)); } void* operator new( std::size_t n, std::align_val_t al) noexcept(false) { return mi_new_aligned(n, static_cast(al)); } void* operator new[]( std::size_t n, std::align_val_t al) noexcept(false) { return mi_new_aligned(n, static_cast(al)); } @@ -128,86 +197,109 @@ terms of the MIT license. A copy of the license can be found in the file // used by GCC and CLang). // See // ------------------------------------------------------ + void _ZdlPv(void* p) MI_FORWARD0(mi_free,p) // delete void _ZdaPv(void* p) MI_FORWARD0(mi_free,p) // delete[] void _ZdlPvm(void* p, size_t n) MI_FORWARD02(mi_free_size,p,n) void _ZdaPvm(void* p, size_t n) MI_FORWARD02(mi_free_size,p,n) + void _ZdlPvSt11align_val_t(void* p, size_t al) { mi_free_aligned(p,al); } void _ZdaPvSt11align_val_t(void* p, size_t al) { mi_free_aligned(p,al); } void _ZdlPvmSt11align_val_t(void* p, size_t n, size_t al) { mi_free_size_aligned(p,n,al); } void _ZdaPvmSt11align_val_t(void* p, size_t n, size_t al) { mi_free_size_aligned(p,n,al); } - typedef struct mi_nothrow_s { int _tag; } mi_nothrow_t; + void _ZdlPvRKSt9nothrow_t(void* p, mi_nothrow_t tag) { MI_UNUSED(tag); mi_free(p); } // operator delete(void*, std::nothrow_t const&) + void _ZdaPvRKSt9nothrow_t(void* p, mi_nothrow_t tag) { MI_UNUSED(tag); mi_free(p); } // operator delete[](void*, std::nothrow_t const&) + void _ZdlPvSt11align_val_tRKSt9nothrow_t(void* p, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); mi_free_aligned(p,al); } // operator delete(void*, std::align_val_t, std::nothrow_t const&) + void _ZdaPvSt11align_val_tRKSt9nothrow_t(void* p, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); mi_free_aligned(p,al); } // operator delete[](void*, std::align_val_t, std::nothrow_t const&) + #if (MI_INTPTR_SIZE==8) void* _Znwm(size_t n) MI_FORWARD1(mi_new,n) // new 64-bit void* _Znam(size_t n) MI_FORWARD1(mi_new,n) // new[] 64-bit + void* _ZnwmRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_nothrow(n); } + void* _ZnamRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_nothrow(n); } void* _ZnwmSt11align_val_t(size_t n, size_t al) MI_FORWARD2(mi_new_aligned, n, al) void* _ZnamSt11align_val_t(size_t n, size_t al) MI_FORWARD2(mi_new_aligned, n, al) - void* _ZnwmRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { UNUSED(tag); return mi_new_nothrow(n); } - void* _ZnamRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { UNUSED(tag); return mi_new_nothrow(n); } - void* _ZnwmSt11align_val_tRKSt9nothrow_t(size_t n, size_t al, mi_nothrow_t tag) { UNUSED(tag); return mi_new_aligned_nothrow(n,al); } - void* _ZnamSt11align_val_tRKSt9nothrow_t(size_t n, size_t al, mi_nothrow_t tag) { UNUSED(tag); return mi_new_aligned_nothrow(n,al); } + void* _ZnwmSt11align_val_tRKSt9nothrow_t(size_t n, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_aligned_nothrow(n,al); } + void* _ZnamSt11align_val_tRKSt9nothrow_t(size_t n, 
size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_aligned_nothrow(n,al); } #elif (MI_INTPTR_SIZE==4) void* _Znwj(size_t n) MI_FORWARD1(mi_new,n) // new 64-bit void* _Znaj(size_t n) MI_FORWARD1(mi_new,n) // new[] 64-bit + void* _ZnwjRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_nothrow(n); } + void* _ZnajRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_nothrow(n); } void* _ZnwjSt11align_val_t(size_t n, size_t al) MI_FORWARD2(mi_new_aligned, n, al) void* _ZnajSt11align_val_t(size_t n, size_t al) MI_FORWARD2(mi_new_aligned, n, al) - void* _ZnwjRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { UNUSED(tag); return mi_new_nothrow(n); } - void* _ZnajRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { UNUSED(tag); return mi_new_nothrow(n); } - void* _ZnwjSt11align_val_tRKSt9nothrow_t(size_t n, size_t al, mi_nothrow_t tag) { UNUSED(tag); return mi_new_aligned_nothrow(n,al); } - void* _ZnajSt11align_val_tRKSt9nothrow_t(size_t n, size_t al, mi_nothrow_t tag) { UNUSED(tag); return mi_new_aligned_nothrow(n,al); } + void* _ZnwjSt11align_val_tRKSt9nothrow_t(size_t n, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_aligned_nothrow(n,al); } + void* _ZnajSt11align_val_tRKSt9nothrow_t(size_t n, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_aligned_nothrow(n,al); } #else - #error "define overloads for new/delete for this platform (just for performance, can be skipped)" + #error "define overloads for new/delete for this platform (just for performance, can be skipped)" #endif #endif // __cplusplus +// ------------------------------------------------------ +// Further Posix & Unix functions definitions +// ------------------------------------------------------ #ifdef __cplusplus extern "C" { #endif -// ------------------------------------------------------ -// Posix & Unix functions definitions -// ------------------------------------------------------ +#ifndef MI_OSX_IS_INTERPOSED + // Forward Posix/Unix calls as well + void* reallocf(void* p, size_t newsize) MI_FORWARD2(mi_reallocf,p,newsize) + size_t malloc_size(const void* p) MI_FORWARD1(mi_usable_size,p) + #if !defined(__ANDROID__) && !defined(__FreeBSD__) + size_t malloc_usable_size(void *p) MI_FORWARD1(mi_usable_size,p) + #else + size_t malloc_usable_size(const void *p) MI_FORWARD1(mi_usable_size,p) + #endif -void cfree(void* p) MI_FORWARD0(mi_free, p) -void* reallocf(void* p, size_t newsize) MI_FORWARD2(mi_reallocf,p,newsize) -size_t malloc_size(const void* p) MI_FORWARD1(mi_usable_size,p) -#if !defined(__ANDROID__) -size_t malloc_usable_size(void *p) MI_FORWARD1(mi_usable_size,p) -#else -size_t malloc_usable_size(const void *p) MI_FORWARD1(mi_usable_size,p) + // No forwarding here due to aliasing/name mangling issues + void* valloc(size_t size) { return mi_valloc(size); } + void vfree(void* p) { mi_free(p); } + size_t malloc_good_size(size_t size) { return mi_malloc_good_size(size); } + int posix_memalign(void** p, size_t alignment, size_t size) { return mi_posix_memalign(p, alignment, size); } + + // `aligned_alloc` is only available when __USE_ISOC11 is defined. + // Note: it seems __USE_ISOC11 is not defined in musl (and perhaps other libc's) so we only check + // for it if using glibc. + // Note: Conda has a custom glibc where `aligned_alloc` is declared `static inline` and we cannot + // override it, but both _ISOC11_SOURCE and __USE_ISOC11 are undefined in Conda GCC7 or GCC9. 
+ // Fortunately, in the case where `aligned_alloc` is declared as `static inline` it + // uses internally `memalign`, `posix_memalign`, or `_aligned_malloc` so we can avoid overriding it ourselves. + #if !defined(__GLIBC__) || __USE_ISOC11 + void* aligned_alloc(size_t alignment, size_t size) { return mi_aligned_alloc(alignment, size); } + #endif #endif // no forwarding here due to aliasing/name mangling issues -void* valloc(size_t size) { return mi_valloc(size); } -void* pvalloc(size_t size) { return mi_pvalloc(size); } -void* reallocarray(void* p, size_t count, size_t size) { return mi_reallocarray(p, count, size); } -void* memalign(size_t alignment, size_t size) { return mi_memalign(alignment, size); } -int posix_memalign(void** p, size_t alignment, size_t size) { return mi_posix_memalign(p, alignment, size); } -void* _aligned_malloc(size_t alignment, size_t size) { return mi_aligned_alloc(alignment, size); } - -// `aligned_alloc` is only available when __USE_ISOC11 is defined. -// Note: Conda has a custom glibc where `aligned_alloc` is declared `static inline` and we cannot -// override it, but both _ISOC11_SOURCE and __USE_ISOC11 are undefined in Conda GCC7 or GCC9. -// Fortunately, in the case where `aligned_alloc` is declared as `static inline` it -// uses internally `memalign`, `posix_memalign`, or `_aligned_malloc` so we can avoid overriding it ourselves. -#if __USE_ISOC11 -void* aligned_alloc(size_t alignment, size_t size) { return mi_aligned_alloc(alignment, size); } -#endif +void cfree(void* p) { mi_free(p); } +void* pvalloc(size_t size) { return mi_pvalloc(size); } +void* memalign(size_t alignment, size_t size) { return mi_memalign(alignment, size); } +void* _aligned_malloc(size_t alignment, size_t size) { return mi_aligned_alloc(alignment, size); } +void* reallocarray(void* p, size_t count, size_t size) { return mi_reallocarray(p, count, size); } +// some systems define reallocarr so mark it as a weak symbol (#751) +mi_decl_weak int reallocarr(void* p, size_t count, size_t size) { return mi_reallocarr(p, count, size); } +#if defined(__wasi__) + // forward __libc interface (see PR #667) + void* __libc_malloc(size_t size) MI_FORWARD1(mi_malloc, size) + void* __libc_calloc(size_t count, size_t size) MI_FORWARD2(mi_calloc, count, size) + void* __libc_realloc(void* p, size_t size) MI_FORWARD2(mi_realloc, p, size) + void __libc_free(void* p) MI_FORWARD0(mi_free, p) + void* __libc_memalign(size_t alignment, size_t size) { return mi_memalign(alignment, size); } -#if defined(__GLIBC__) && defined(__linux__) +#elif defined(__GLIBC__) && defined(__linux__) // forward __libc interface (needed for glibc-based Linux distributions) - void* __libc_malloc(size_t size) MI_FORWARD1(mi_malloc,size) - void* __libc_calloc(size_t count, size_t size) MI_FORWARD2(mi_calloc,count,size) - void* __libc_realloc(void* p, size_t size) MI_FORWARD2(mi_realloc,p,size) - void __libc_free(void* p) MI_FORWARD0(mi_free,p) - void __libc_cfree(void* p) MI_FORWARD0(mi_free,p) - - void* __libc_valloc(size_t size) { return mi_valloc(size); } - void* __libc_pvalloc(size_t size) { return mi_pvalloc(size); } - void* __libc_memalign(size_t alignment, size_t size) { return mi_memalign(alignment,size); } + void* __libc_malloc(size_t size) MI_FORWARD1(mi_malloc,size) + void* __libc_calloc(size_t count, size_t size) MI_FORWARD2(mi_calloc,count,size) + void* __libc_realloc(void* p, size_t size) MI_FORWARD2(mi_realloc,p,size) + void __libc_free(void* p) MI_FORWARD0(mi_free,p) + void __libc_cfree(void* p) MI_FORWARD0(mi_free,p) 
+ + void* __libc_valloc(size_t size) { return mi_valloc(size); } + void* __libc_pvalloc(size_t size) { return mi_pvalloc(size); } + void* __libc_memalign(size_t alignment, size_t size) { return mi_memalign(alignment,size); } int __posix_memalign(void** p, size_t alignment, size_t size) { return mi_posix_memalign(p,alignment,size); } #endif diff --git a/contrib/libs/mimalloc/src/alloc-posix.c b/contrib/libs/mimalloc/src/alloc-posix.c index 43931e56dadb..225752fd8707 100644 --- a/contrib/libs/mimalloc/src/alloc-posix.c +++ b/contrib/libs/mimalloc/src/alloc-posix.c @@ -10,7 +10,7 @@ terms of the MIT license. A copy of the license can be found in the file // for convenience and used when overriding these functions. // ------------------------------------------------------------------------ #include "mimalloc.h" -#include "mimalloc-internal.h" +#include "mimalloc/internal.h" // ------------------------------------------------------ // Posix & Unix functions definitions @@ -32,14 +32,20 @@ terms of the MIT license. A copy of the license can be found in the file #endif -size_t mi_malloc_size(const void* p) mi_attr_noexcept { +mi_decl_nodiscard size_t mi_malloc_size(const void* p) mi_attr_noexcept { + // if (!mi_is_in_heap_region(p)) return 0; return mi_usable_size(p); } -size_t mi_malloc_usable_size(const void *p) mi_attr_noexcept { +mi_decl_nodiscard size_t mi_malloc_usable_size(const void *p) mi_attr_noexcept { + // if (!mi_is_in_heap_region(p)) return 0; return mi_usable_size(p); } +mi_decl_nodiscard size_t mi_malloc_good_size(size_t size) mi_attr_noexcept { + return mi_good_size(size); +} + void mi_cfree(void* p) mi_attr_noexcept { if (mi_is_in_heap_region(p)) { mi_free(p); @@ -50,53 +56,75 @@ int mi_posix_memalign(void** p, size_t alignment, size_t size) mi_attr_noexcept // Note: The spec dictates we should not modify `*p` on an error. (issue#27) // if (p == NULL) return EINVAL; - if (alignment % sizeof(void*) != 0) return EINVAL; // natural alignment - if (!_mi_is_power_of_two(alignment)) return EINVAL; // not a power of 2 - void* q = (mi_malloc_satisfies_alignment(alignment, size) ? mi_malloc(size) : mi_malloc_aligned(size, alignment)); + if ((alignment % sizeof(void*)) != 0) return EINVAL; // natural alignment + // it is also required that alignment is a power of 2 and > 0; this is checked in `mi_malloc_aligned` + if (alignment==0 || !_mi_is_power_of_two(alignment)) return EINVAL; // not a power of 2 + void* q = mi_malloc_aligned(size, alignment); if (q==NULL && size != 0) return ENOMEM; mi_assert_internal(((uintptr_t)q % alignment) == 0); *p = q; return 0; } -mi_decl_restrict void* mi_memalign(size_t alignment, size_t size) mi_attr_noexcept { - void* p = (mi_malloc_satisfies_alignment(alignment,size) ? 
mi_malloc(size) : mi_malloc_aligned(size, alignment)); +mi_decl_nodiscard mi_decl_restrict void* mi_memalign(size_t alignment, size_t size) mi_attr_noexcept { + void* p = mi_malloc_aligned(size, alignment); mi_assert_internal(((uintptr_t)p % alignment) == 0); return p; } -mi_decl_restrict void* mi_valloc(size_t size) mi_attr_noexcept { +mi_decl_nodiscard mi_decl_restrict void* mi_valloc(size_t size) mi_attr_noexcept { return mi_memalign( _mi_os_page_size(), size ); } -mi_decl_restrict void* mi_pvalloc(size_t size) mi_attr_noexcept { +mi_decl_nodiscard mi_decl_restrict void* mi_pvalloc(size_t size) mi_attr_noexcept { size_t psize = _mi_os_page_size(); if (size >= SIZE_MAX - psize) return NULL; // overflow size_t asize = _mi_align_up(size, psize); return mi_malloc_aligned(asize, psize); } -mi_decl_restrict void* mi_aligned_alloc(size_t alignment, size_t size) mi_attr_noexcept { - if (alignment==0 || !_mi_is_power_of_two(alignment)) return NULL; - if ((size&(alignment-1)) != 0) return NULL; // C11 requires integral multiple, see - void* p = (mi_malloc_satisfies_alignment(alignment, size) ? mi_malloc(size) : mi_malloc_aligned(size, alignment)); +mi_decl_nodiscard mi_decl_restrict void* mi_aligned_alloc(size_t alignment, size_t size) mi_attr_noexcept { + // C11 requires the size to be an integral multiple of the alignment, see . + // unfortunately, it turns out quite some programs pass a size that is not an integral multiple so skip this check.. + /* if mi_unlikely((size & (alignment - 1)) != 0) { // C11 requires alignment>0 && integral multiple, see + #if MI_DEBUG > 0 + _mi_error_message(EOVERFLOW, "(mi_)aligned_alloc requires the size to be an integral multiple of the alignment (size %zu, alignment %zu)\n", size, alignment); + #endif + return NULL; + } + */ + // C11 also requires alignment to be a power-of-two (and > 0) which is checked in mi_malloc_aligned + void* p = mi_malloc_aligned(size, alignment); mi_assert_internal(((uintptr_t)p % alignment) == 0); return p; } -void* mi_reallocarray( void* p, size_t count, size_t size ) mi_attr_noexcept { // BSD +mi_decl_nodiscard void* mi_reallocarray( void* p, size_t count, size_t size ) mi_attr_noexcept { // BSD void* newp = mi_reallocn(p,count,size); - if (newp==NULL) errno = ENOMEM; + if (newp==NULL) { errno = ENOMEM; } return newp; } +mi_decl_nodiscard int mi_reallocarr( void* p, size_t count, size_t size ) mi_attr_noexcept { // NetBSD + mi_assert(p != NULL); + if (p == NULL) { + errno = EINVAL; + return EINVAL; + } + void** op = (void**)p; + void* newp = mi_reallocarray(*op, count, size); + if mi_unlikely(newp == NULL) { return errno; } + *op = newp; + return 0; +} + void* mi__expand(void* p, size_t newsize) mi_attr_noexcept { // Microsoft void* res = mi_expand(p, newsize); - if (res == NULL) errno = ENOMEM; + if (res == NULL) { errno = ENOMEM; } return res; } -mi_decl_restrict unsigned short* mi_wcsdup(const unsigned short* s) mi_attr_noexcept { +mi_decl_nodiscard mi_decl_restrict unsigned short* mi_wcsdup(const unsigned short* s) mi_attr_noexcept { if (s==NULL) return NULL; size_t len; for(len = 0; s[len] != 0; len++) { } @@ -108,7 +136,7 @@ mi_decl_restrict unsigned short* mi_wcsdup(const unsigned short* s) mi_attr_noex return p; } -mi_decl_restrict unsigned char* mi_mbsdup(const unsigned char* s) mi_attr_noexcept { +mi_decl_nodiscard mi_decl_restrict unsigned char* mi_mbsdup(const unsigned char* s) mi_attr_noexcept { return (unsigned char*)mi_strdup((const char*)s); } @@ -122,7 +150,7 @@ int mi_dupenv_s(char** buf, size_t* size, const char* 
name) mi_attr_noexcept { else { *buf = mi_strdup(p); if (*buf==NULL) return ENOMEM; - if (size != NULL) *size = strlen(p); + if (size != NULL) *size = _mi_strlen(p); } return 0; } @@ -148,10 +176,10 @@ int mi_wdupenv_s(unsigned short** buf, size_t* size, const unsigned short* name) #endif } -void* mi_aligned_offset_recalloc(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { // Microsoft +mi_decl_nodiscard void* mi_aligned_offset_recalloc(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { // Microsoft return mi_recalloc_aligned_at(p, newcount, size, alignment, offset); } -void* mi_aligned_recalloc(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept { // Microsoft +mi_decl_nodiscard void* mi_aligned_recalloc(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept { // Microsoft return mi_recalloc_aligned(p, newcount, size, alignment); } diff --git a/contrib/libs/mimalloc/src/alloc.c b/contrib/libs/mimalloc/src/alloc.c index 8acff78327c3..6c9c5baf3618 100644 --- a/contrib/libs/mimalloc/src/alloc.c +++ b/contrib/libs/mimalloc/src/alloc.c @@ -1,18 +1,24 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2021, Microsoft Research, Daan Leijen +Copyright (c) 2018-2024, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ +#ifndef _DEFAULT_SOURCE +#define _DEFAULT_SOURCE // for realpath() on Linux +#endif + #include "mimalloc.h" -#include "mimalloc-internal.h" -#include "mimalloc-atomic.h" +#include "mimalloc/internal.h" +#include "mimalloc/atomic.h" +#include "mimalloc/prim.h" // _mi_prim_thread_id() -#include // memset, strlen -#include // malloc, exit +#include // memset, strlen (for mi_strdup) +#include // malloc, abort #define MI_IN_ALLOC_C #include "alloc-override.c" +#include "free.c" #undef MI_IN_ALLOC_C // ------------------------------------------------------ @@ -21,625 +27,254 @@ terms of the MIT license. A copy of the license can be found in the file // Fast allocation in a page: just pop from the free list. // Fall back to generic allocation only if the list is empty. -extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept { - mi_assert_internal(page->xblock_size==0||mi_page_block_size(page) >= size); +// Note: in release mode the (inlined) routine is about 7 instructions with a single test. 
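+// When `zero` is set, the full block size is zero-initialized rather than just the requested `size` (see issue #63).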
+extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept +{ + mi_assert_internal(page->block_size == 0 /* empty heap */ || mi_page_block_size(page) >= size); mi_block_t* const block = page->free; - if (mi_unlikely(block == NULL)) { - return _mi_malloc_generic(heap, size); + if mi_unlikely(block == NULL) { + return _mi_malloc_generic(heap, size, zero, 0); } mi_assert_internal(block != NULL && _mi_ptr_page(block) == page); // pop from the free list - page->used++; page->free = mi_block_next(page, block); + page->used++; mi_assert_internal(page->free == NULL || _mi_ptr_page(page->free) == page); + #if MI_DEBUG>3 + if (page->free_is_zero) { + mi_assert_expensive(mi_mem_is_zero(block+1,size - sizeof(*block))); + } + #endif -#if (MI_DEBUG>0) - if (!page->is_zero) { memset(block, MI_DEBUG_UNINIT, size); } -#elif (MI_SECURE!=0) - block->next = 0; // don't leak internal data -#endif + // allow use of the block internally + // note: when tracking we need to avoid ever touching the MI_PADDING since + // that is tracked by valgrind etc. as non-accessible (through the red-zone, see `mimalloc/track.h`) + mi_track_mem_undefined(block, mi_page_usable_block_size(page)); + + // zero the block? note: we need to zero the full block size (issue #63) + if mi_unlikely(zero) { + mi_assert_internal(page->block_size != 0); // do not call with zero'ing for huge blocks (see _mi_malloc_generic) + mi_assert_internal(page->block_size >= MI_PADDING_SIZE); + if (page->free_is_zero) { + block->next = 0; + mi_track_mem_defined(block, page->block_size - MI_PADDING_SIZE); + } + else { + _mi_memzero_aligned(block, page->block_size - MI_PADDING_SIZE); + } + } + + #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN + if (!zero && !mi_page_is_huge(page)) { + memset(block, MI_DEBUG_UNINIT, mi_page_usable_block_size(page)); + } + #elif (MI_SECURE!=0) + if (!zero) { block->next = 0; } // don't leak internal data + #endif -#if (MI_STAT>0) + #if (MI_STAT>0) const size_t bsize = mi_page_usable_block_size(page); if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { mi_heap_stat_increase(heap, normal, bsize); mi_heap_stat_counter_increase(heap, normal_count, 1); -#if (MI_STAT>1) + #if (MI_STAT>1) const size_t bin = _mi_bin(bsize); mi_heap_stat_increase(heap, normal_bins[bin], 1); -#endif + #endif } -#endif + #endif -#if (MI_PADDING > 0) && defined(MI_ENCODE_FREELIST) - mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + mi_page_usable_block_size(page)); - ptrdiff_t delta = ((uint8_t*)padding - (uint8_t*)block - (size - MI_PADDING_SIZE)); - mi_assert_internal(delta >= 0 && mi_page_usable_block_size(page) >= (size - MI_PADDING_SIZE + delta)); - padding->canary = (uint32_t)(mi_ptr_encode(page,block,page->keys)); - padding->delta = (uint32_t)(delta); - uint8_t* fill = (uint8_t*)padding - delta; - const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? 
MI_MAX_ALIGN_SIZE : delta); // set at most N initial padding bytes - for (size_t i = 0; i < maxpad; i++) { fill[i] = MI_DEBUG_PADDING; } -#endif + #if MI_PADDING // && !MI_TRACK_ENABLED + mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + mi_page_usable_block_size(page)); + ptrdiff_t delta = ((uint8_t*)padding - (uint8_t*)block - (size - MI_PADDING_SIZE)); + #if (MI_DEBUG>=2) + mi_assert_internal(delta >= 0 && mi_page_usable_block_size(page) >= (size - MI_PADDING_SIZE + delta)); + #endif + mi_track_mem_defined(padding,sizeof(mi_padding_t)); // note: re-enable since mi_page_usable_block_size may set noaccess + padding->canary = (uint32_t)(mi_ptr_encode(page,block,page->keys)); + padding->delta = (uint32_t)(delta); + #if MI_PADDING_CHECK + if (!mi_page_is_huge(page)) { + uint8_t* fill = (uint8_t*)padding - delta; + const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? MI_MAX_ALIGN_SIZE : delta); // set at most N initial padding bytes + for (size_t i = 0; i < maxpad; i++) { fill[i] = MI_DEBUG_PADDING; } + } + #endif + #endif return block; } -// allocate a small block -extern inline mi_decl_restrict void* mi_heap_malloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept { - mi_assert(heap!=NULL); - mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id()); // heaps are thread local +// extra entries for improved efficiency in `alloc-aligned.c`. +extern void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept { + return _mi_page_malloc_zero(heap,page,size,false); +} +extern void* _mi_page_malloc_zeroed(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept { + return _mi_page_malloc_zero(heap,page,size,true); +} + +static inline mi_decl_restrict void* mi_heap_malloc_small_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept { + mi_assert(heap != NULL); + #if MI_DEBUG + const uintptr_t tid = _mi_thread_id(); + mi_assert(heap->thread_id == 0 || heap->thread_id == tid); // heaps are thread local + #endif mi_assert(size <= MI_SMALL_SIZE_MAX); #if (MI_PADDING) - if (size == 0) { - size = sizeof(void*); - } + if (size == 0) { size = sizeof(void*); } #endif - mi_page_t* page = _mi_heap_get_free_small_page(heap,size + MI_PADDING_SIZE); - void* p = _mi_page_malloc(heap, page, size + MI_PADDING_SIZE); - mi_assert_internal(p==NULL || mi_usable_size(p) >= size); + + mi_page_t* page = _mi_heap_get_free_small_page(heap, size + MI_PADDING_SIZE); + void* const p = _mi_page_malloc_zero(heap, page, size + MI_PADDING_SIZE, zero); + mi_track_malloc(p,size,zero); + #if MI_STAT>1 if (p != NULL) { - if (!mi_heap_is_initialized(heap)) { heap = mi_get_default_heap(); } + if (!mi_heap_is_initialized(heap)) { heap = mi_prim_get_default_heap(); } mi_heap_stat_increase(heap, malloc, mi_usable_size(p)); } #endif + #if MI_DEBUG>3 + if (p != NULL && zero) { + mi_assert_expensive(mi_mem_is_zero(p, size)); + } + #endif return p; } -extern inline mi_decl_restrict void* mi_malloc_small(size_t size) mi_attr_noexcept { - return mi_heap_malloc_small(mi_get_default_heap(), size); +// allocate a small block +mi_decl_nodiscard extern inline mi_decl_restrict void* mi_heap_malloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept { + return mi_heap_malloc_small_zero(heap, size, false); +} + +mi_decl_nodiscard extern inline mi_decl_restrict void* mi_malloc_small(size_t size) mi_attr_noexcept { + return mi_heap_malloc_small(mi_prim_get_default_heap(), size); } // The main allocation function -extern inline mi_decl_restrict void* mi_heap_malloc(mi_heap_t* heap, size_t 
size) mi_attr_noexcept { - if (mi_likely(size <= MI_SMALL_SIZE_MAX)) { - return mi_heap_malloc_small(heap, size); +extern inline void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept { + if mi_likely(size <= MI_SMALL_SIZE_MAX) { + mi_assert_internal(huge_alignment == 0); + return mi_heap_malloc_small_zero(heap, size, zero); } else { mi_assert(heap!=NULL); - mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id()); // heaps are thread local - void* const p = _mi_malloc_generic(heap, size + MI_PADDING_SIZE); // note: size can overflow but it is detected in malloc_generic - mi_assert_internal(p == NULL || mi_usable_size(p) >= size); + mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id()); // heaps are thread local + void* const p = _mi_malloc_generic(heap, size + MI_PADDING_SIZE, zero, huge_alignment); // note: size can overflow but it is detected in malloc_generic + mi_track_malloc(p,size,zero); #if MI_STAT>1 if (p != NULL) { - if (!mi_heap_is_initialized(heap)) { heap = mi_get_default_heap(); } + if (!mi_heap_is_initialized(heap)) { heap = mi_prim_get_default_heap(); } mi_heap_stat_increase(heap, malloc, mi_usable_size(p)); } #endif - return p; - } -} - -extern inline mi_decl_restrict void* mi_malloc(size_t size) mi_attr_noexcept { - return mi_heap_malloc(mi_get_default_heap(), size); -} - - -void _mi_block_zero_init(const mi_page_t* page, void* p, size_t size) { - // note: we need to initialize the whole usable block size to zero, not just the requested size, - // or the recalloc/rezalloc functions cannot safely expand in place (see issue #63) - UNUSED(size); - mi_assert_internal(p != NULL); - mi_assert_internal(mi_usable_size(p) >= size); // size can be zero - mi_assert_internal(_mi_ptr_page(p)==page); - if (page->is_zero && size > sizeof(mi_block_t)) { - // already zero initialized memory - ((mi_block_t*)p)->next = 0; // clear the free list pointer - mi_assert_expensive(mi_mem_is_zero(p, mi_usable_size(p))); - } - else { - // otherwise memset - memset(p, 0, mi_usable_size(p)); - } -} - -// zero initialized small block -mi_decl_restrict void* mi_zalloc_small(size_t size) mi_attr_noexcept { - void* p = mi_malloc_small(size); - if (p != NULL) { - _mi_block_zero_init(_mi_ptr_page(p), p, size); // todo: can we avoid getting the page again? - } - return p; -} - -void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) { - void* p = mi_heap_malloc(heap,size); - if (zero && p != NULL) { - _mi_block_zero_init(_mi_ptr_page(p),p,size); // todo: can we avoid getting the page again? 
- } - return p; -} - -extern inline mi_decl_restrict void* mi_heap_zalloc(mi_heap_t* heap, size_t size) mi_attr_noexcept { - return _mi_heap_malloc_zero(heap, size, true); -} - -mi_decl_restrict void* mi_zalloc(size_t size) mi_attr_noexcept { - return mi_heap_zalloc(mi_get_default_heap(),size); -} - - -// ------------------------------------------------------ -// Check for double free in secure and debug mode -// This is somewhat expensive so only enabled for secure mode 4 -// ------------------------------------------------------ - -#if (MI_ENCODE_FREELIST && (MI_SECURE>=4 || MI_DEBUG!=0)) -// linear check if the free list contains a specific element -static bool mi_list_contains(const mi_page_t* page, const mi_block_t* list, const mi_block_t* elem) { - while (list != NULL) { - if (elem==list) return true; - list = mi_block_next(page, list); - } - return false; -} - -static mi_decl_noinline bool mi_check_is_double_freex(const mi_page_t* page, const mi_block_t* block) { - // The decoded value is in the same page (or NULL). - // Walk the free lists to verify positively if it is already freed - if (mi_list_contains(page, page->free, block) || - mi_list_contains(page, page->local_free, block) || - mi_list_contains(page, mi_page_thread_free(page), block)) - { - _mi_error_message(EAGAIN, "double free detected of block %p with size %zu\n", block, mi_page_block_size(page)); - return true; - } - return false; -} - -static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) { - mi_block_t* n = mi_block_nextx(page, block, page->keys); // pretend it is freed, and get the decoded first field - if (((uintptr_t)n & (MI_INTPTR_SIZE-1))==0 && // quick check: aligned pointer? - (n==NULL || mi_is_in_same_page(block, n))) // quick check: in same page or NULL? - { - // Suspicous: decoded value a in block is in the same page (or NULL) -- maybe a double free? - // (continue in separate function to improve code generation) - return mi_check_is_double_freex(page, block); - } - return false; -} -#else -static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) { - UNUSED(page); - UNUSED(block); - return false; -} -#endif - -// --------------------------------------------------------------------------- -// Check for heap block overflow by setting up padding at the end of the block -// --------------------------------------------------------------------------- - -#if (MI_PADDING>0) && defined(MI_ENCODE_FREELIST) -static bool mi_page_decode_padding(const mi_page_t* page, const mi_block_t* block, size_t* delta, size_t* bsize) { - *bsize = mi_page_usable_block_size(page); - const mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + *bsize); - *delta = padding->delta; - return ((uint32_t)mi_ptr_encode(page,block,page->keys) == padding->canary && *delta <= *bsize); -} - -// Return the exact usable size of a block. -static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) { - size_t bsize; - size_t delta; - bool ok = mi_page_decode_padding(page, block, &delta, &bsize); - mi_assert_internal(ok); mi_assert_internal(delta <= bsize); - return (ok ? 
bsize - delta : 0); -} - -static bool mi_verify_padding(const mi_page_t* page, const mi_block_t* block, size_t* size, size_t* wrong) { - size_t bsize; - size_t delta; - bool ok = mi_page_decode_padding(page, block, &delta, &bsize); - *size = *wrong = bsize; - if (!ok) return false; - mi_assert_internal(bsize >= delta); - *size = bsize - delta; - uint8_t* fill = (uint8_t*)block + bsize - delta; - const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? MI_MAX_ALIGN_SIZE : delta); // check at most the first N padding bytes - for (size_t i = 0; i < maxpad; i++) { - if (fill[i] != MI_DEBUG_PADDING) { - *wrong = bsize - delta + i; - return false; - } - } - return true; -} - -static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { - size_t size; - size_t wrong; - if (!mi_verify_padding(page,block,&size,&wrong)) { - _mi_error_message(EFAULT, "buffer overflow in heap block %p of size %zu: write after %zu bytes\n", block, size, wrong ); - } -} - -// When a non-thread-local block is freed, it becomes part of the thread delayed free -// list that is freed later by the owning heap. If the exact usable size is too small to -// contain the pointer for the delayed list, then shrink the padding (by decreasing delta) -// so it will later not trigger an overflow error in `mi_free_block`. -static void mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) { - size_t bsize; - size_t delta; - bool ok = mi_page_decode_padding(page, block, &delta, &bsize); - mi_assert_internal(ok); - if (!ok || (bsize - delta) >= min_size) return; // usually already enough space - mi_assert_internal(bsize >= min_size); - if (bsize < min_size) return; // should never happen - size_t new_delta = (bsize - min_size); - mi_assert_internal(new_delta < bsize); - mi_padding_t* padding = (mi_padding_t*)((uint8_t*)block + bsize); - padding->delta = (uint32_t)new_delta; -} -#else -static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { - UNUSED(page); - UNUSED(block); -} - -static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) { - UNUSED(block); - return mi_page_usable_block_size(page); -} - -static void mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) { - UNUSED(page); - UNUSED(block); - UNUSED(min_size); -} -#endif - -// only maintain stats for smaller objects if requested -#if (MI_STAT>0) -static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { -#if (MI_STAT < 2) - UNUSED(block); -#endif - mi_heap_t* const heap = mi_heap_get_default(); - const size_t bsize = mi_page_usable_block_size(page); -#if (MI_STAT>1) - const size_t usize = mi_page_usable_size_of(page, block); - mi_heap_stat_decrease(heap, malloc, usize); -#endif - if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { - mi_heap_stat_decrease(heap, normal, bsize); -#if (MI_STAT > 1) - mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], 1); -#endif - } -} -#else -static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { - UNUSED(page); UNUSED(block); -} -#endif - -#if (MI_STAT>0) -// maintain stats for huge objects -static void mi_stat_huge_free(const mi_page_t* page) { - mi_heap_t* const heap = mi_heap_get_default(); - const size_t bsize = mi_page_block_size(page); // to match stats in `page.c:mi_page_huge_alloc` - if (bsize <= MI_HUGE_OBJ_SIZE_MAX) { - mi_heap_stat_decrease(heap, huge, bsize); - } - else { - mi_heap_stat_decrease(heap, giant, bsize); - } -} -#else -static void mi_stat_huge_free(const mi_page_t* 
page) { - UNUSED(page); -} -#endif - -// ------------------------------------------------------ -// Free -// ------------------------------------------------------ - -// multi-threaded free -static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* block) -{ - // The padding check may access the non-thread-owned page for the key values. - // that is safe as these are constant and the page won't be freed (as the block is not freed yet). - mi_check_padding(page, block); - mi_padding_shrink(page, block, sizeof(mi_block_t)); // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection - #if (MI_DEBUG!=0) - memset(block, MI_DEBUG_FREED, mi_usable_size(block)); - #endif - - // huge page segments are always abandoned and can be freed immediately - mi_segment_t* const segment = _mi_page_segment(page); - if (segment->page_kind==MI_PAGE_HUGE) { - mi_stat_huge_free(page); - _mi_segment_huge_page_free(segment, page, block); - return; - } - - // Try to put the block on either the page-local thread free list, or the heap delayed free list. - mi_thread_free_t tfreex; - bool use_delayed; - mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free); - do { - use_delayed = (mi_tf_delayed(tfree) == MI_USE_DELAYED_FREE); - if (mi_unlikely(use_delayed)) { - // unlikely: this only happens on the first concurrent free in a page that is in the full list - tfreex = mi_tf_set_delayed(tfree,MI_DELAYED_FREEING); + #if MI_DEBUG>3 + if (p != NULL && zero) { + mi_assert_expensive(mi_mem_is_zero(p, size)); } - else { - // usual: directly add to page thread_free list - mi_block_set_next(page, block, mi_tf_block(tfree)); - tfreex = mi_tf_set_block(tfree,block); - } - } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); - - if (mi_unlikely(use_delayed)) { - // racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see `mi_heap_delete` and `mi_heap_collect_abandon`) - mi_heap_t* const heap = (mi_heap_t*)(mi_atomic_load_acquire(&page->xheap)); //mi_page_heap(page); - mi_assert_internal(heap != NULL); - if (heap != NULL) { - // add to the delayed free list of this heap. 
(do this atomically as the lock only protects heap memory validity) - mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); - do { - mi_block_set_nextx(heap,block,dfree, heap->keys); - } while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block)); - } - - // and reset the MI_DELAYED_FREEING flag - tfree = mi_atomic_load_relaxed(&page->xthread_free); - do { - tfreex = tfree; - mi_assert_internal(mi_tf_delayed(tfree) == MI_DELAYED_FREEING); - tfreex = mi_tf_set_delayed(tfree,MI_NO_DELAYED_FREE); - } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); - } -} - -// regular free -static inline void _mi_free_block(mi_page_t* page, bool local, mi_block_t* block) -{ - // and push it on the free list - if (mi_likely(local)) { - // owning thread can free a block directly - if (mi_unlikely(mi_check_is_double_free(page, block))) return; - mi_check_padding(page, block); - #if (MI_DEBUG!=0) - memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); #endif - mi_block_set_next(page, block, page->local_free); - page->local_free = block; - page->used--; - if (mi_unlikely(mi_page_all_free(page))) { - _mi_page_retire(page); - } - else if (mi_unlikely(mi_page_is_in_full(page))) { - _mi_page_unfull(page); - } - } - else { - _mi_free_block_mt(page,block); + return p; } } - -// Adjust a block that was allocated aligned, to the actual start of the block in the page. -mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p) { - mi_assert_internal(page!=NULL && p!=NULL); - const size_t diff = (uint8_t*)p - _mi_page_start(segment, page, NULL); - const size_t adjust = (diff % mi_page_block_size(page)); - return (mi_block_t*)((uintptr_t)p - adjust); -} - - -static void mi_decl_noinline mi_free_generic(const mi_segment_t* segment, bool local, void* p) { - mi_page_t* const page = _mi_segment_page_of(segment, p); - mi_block_t* const block = (mi_page_has_aligned(page) ? _mi_page_ptr_unalign(segment, page, p) : (mi_block_t*)p); - mi_stat_free(page, block); - _mi_free_block(page, local, block); +extern inline void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept { + return _mi_heap_malloc_zero_ex(heap, size, zero, 0); } -// Get the segment data belonging to a pointer -// This is just a single `and` in assembly but does further checks in debug mode -// (and secure mode) if this was a valid pointer. 
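// The "single `and`" mentioned above refers to masking off the low address bits of a
// pointer to find its owning, segment-aligned base, as _mi_ptr_segment does. A minimal
// sketch with a hypothetical demo_* name and an assumed 4 MiB alignment (the real value
// comes from MI_SEGMENT_SIZE):
#include <stdint.h>

#define DEMO_SEGMENT_SIZE ((uintptr_t)1 << 22)  // assumed 4 MiB segment alignment

static inline void* demo_ptr_segment(const void* p) {
  return (void*)((uintptr_t)p & ~(DEMO_SEGMENT_SIZE - 1));  // the single "and"
}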
-static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* msg) -{ - UNUSED(msg); -#if (MI_DEBUG>0) - if (mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0)) { - _mi_error_message(EINVAL, "%s: invalid (unaligned) pointer: %p\n", msg, p); - return NULL; - } -#endif - - mi_segment_t* const segment = _mi_ptr_segment(p); - if (mi_unlikely(segment == NULL)) return NULL; // checks also for (p==NULL) - -#if (MI_DEBUG>0) - if (mi_unlikely(!mi_is_in_heap_region(p))) { - _mi_warning_message("%s: pointer might not point to a valid heap region: %p\n" - "(this may still be a valid very large allocation (over 64MiB))\n", msg, p); - if (mi_likely(_mi_ptr_cookie(segment) == segment->cookie)) { - _mi_warning_message("(yes, the previous pointer %p was valid after all)\n", p); - } - } -#endif -#if (MI_DEBUG>0 || MI_SECURE>=4) - if (mi_unlikely(_mi_ptr_cookie(segment) != segment->cookie)) { - _mi_error_message(EINVAL, "%s: pointer does not point to a valid heap space: %p\n", p); - } -#endif - return segment; -} - - -// Free a block -void mi_free(void* p) mi_attr_noexcept -{ - const mi_segment_t* const segment = mi_checked_ptr_segment(p,"mi_free"); - if (mi_unlikely(segment == NULL)) return; - - const uintptr_t tid = _mi_thread_id(); - mi_page_t* const page = _mi_segment_page_of(segment, p); - mi_block_t* const block = (mi_block_t*)p; - - if (mi_likely(tid == segment->thread_id && page->flags.full_aligned == 0)) { // the thread id matches and it is not a full page, nor has aligned blocks - // local, and not full or aligned - if (mi_unlikely(mi_check_is_double_free(page,block))) return; - mi_check_padding(page, block); - mi_stat_free(page, block); - #if (MI_DEBUG!=0) - memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); - #endif - mi_block_set_next(page, block, page->local_free); - page->local_free = block; - if (mi_unlikely(--page->used == 0)) { // using this expression generates better code than: page->used--; if (mi_page_all_free(page)) - _mi_page_retire(page); - } - } - else { - // non-local, aligned blocks, or a full page; use the more generic path - // note: recalc page in generic to improve code generation - mi_free_generic(segment, tid == segment->thread_id, p); - } +mi_decl_nodiscard extern inline mi_decl_restrict void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcept { + return _mi_heap_malloc_zero(heap, size, false); } -bool _mi_free_delayed_block(mi_block_t* block) { - // get segment and page - const mi_segment_t* const segment = _mi_ptr_segment(block); - mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie); - mi_assert_internal(_mi_thread_id() == segment->thread_id); - mi_page_t* const page = _mi_segment_page_of(segment, block); - - // Clear the no-delayed flag so delayed freeing is used again for this page. - // This must be done before collecting the free lists on this page -- otherwise - // some blocks may end up in the page `thread_free` list with no blocks in the - // heap `thread_delayed_free` list which may cause the page to be never freed! 
- // (it would only be freed if we happen to scan it in `mi_page_queue_find_free_ex`) - _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, false /* dont overwrite never delayed */); - - // collect all other non-local frees to ensure up-to-date `used` count - _mi_page_free_collect(page, false); - - // and free the block (possibly freeing the page as well since used is updated) - _mi_free_block(page, true, block); - return true; -} - -// Bytes available in a block -static size_t _mi_usable_size(const void* p, const char* msg) mi_attr_noexcept { - const mi_segment_t* const segment = mi_checked_ptr_segment(p,msg); - if (segment==NULL) return 0; - const mi_page_t* const page = _mi_segment_page_of(segment, p); - const mi_block_t* block = (const mi_block_t*)p; - if (mi_unlikely(mi_page_has_aligned(page))) { - block = _mi_page_ptr_unalign(segment, page, p); - size_t size = mi_page_usable_size_of(page, block); - ptrdiff_t const adjust = (uint8_t*)p - (uint8_t*)block; - mi_assert_internal(adjust >= 0 && (size_t)adjust <= size); - return (size - adjust); - } - else { - return mi_page_usable_size_of(page, block); - } +mi_decl_nodiscard extern inline mi_decl_restrict void* mi_malloc(size_t size) mi_attr_noexcept { + return mi_heap_malloc(mi_prim_get_default_heap(), size); } -size_t mi_usable_size(const void* p) mi_attr_noexcept { - return _mi_usable_size(p, "mi_usable_size"); +// zero initialized small block +mi_decl_nodiscard mi_decl_restrict void* mi_zalloc_small(size_t size) mi_attr_noexcept { + return mi_heap_malloc_small_zero(mi_prim_get_default_heap(), size, true); } - -// ------------------------------------------------------ -// ensure explicit external inline definitions are emitted! -// ------------------------------------------------------ - -#ifdef __cplusplus -void* _mi_externs[] = { - (void*)&_mi_page_malloc, - (void*)&mi_malloc, - (void*)&mi_malloc_small, - (void*)&mi_zalloc_small, - (void*)&mi_heap_malloc, - (void*)&mi_heap_zalloc, - (void*)&mi_heap_malloc_small -}; -#endif - - -// ------------------------------------------------------ -// Allocation extensions -// ------------------------------------------------------ - -void mi_free_size(void* p, size_t size) mi_attr_noexcept { - UNUSED_RELEASE(size); - mi_assert(p == NULL || size <= _mi_usable_size(p,"mi_free_size")); - mi_free(p); +mi_decl_nodiscard extern inline mi_decl_restrict void* mi_heap_zalloc(mi_heap_t* heap, size_t size) mi_attr_noexcept { + return _mi_heap_malloc_zero(heap, size, true); } -void mi_free_size_aligned(void* p, size_t size, size_t alignment) mi_attr_noexcept { - UNUSED_RELEASE(alignment); - mi_assert(((uintptr_t)p % alignment) == 0); - mi_free_size(p,size); +mi_decl_nodiscard mi_decl_restrict void* mi_zalloc(size_t size) mi_attr_noexcept { + return mi_heap_zalloc(mi_prim_get_default_heap(),size); } -void mi_free_aligned(void* p, size_t alignment) mi_attr_noexcept { - UNUSED_RELEASE(alignment); - mi_assert(((uintptr_t)p % alignment) == 0); - mi_free(p); -} -extern inline mi_decl_restrict void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept { +mi_decl_nodiscard extern inline mi_decl_restrict void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept { size_t total; if (mi_count_size_overflow(count,size,&total)) return NULL; return mi_heap_zalloc(heap,total); } -mi_decl_restrict void* mi_calloc(size_t count, size_t size) mi_attr_noexcept { - return mi_heap_calloc(mi_get_default_heap(),count,size); +mi_decl_nodiscard mi_decl_restrict void* mi_calloc(size_t 
count, size_t size) mi_attr_noexcept { + return mi_heap_calloc(mi_prim_get_default_heap(),count,size); } // Uninitialized `calloc` -extern mi_decl_restrict void* mi_heap_mallocn(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept { +mi_decl_nodiscard extern mi_decl_restrict void* mi_heap_mallocn(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept { size_t total; if (mi_count_size_overflow(count, size, &total)) return NULL; return mi_heap_malloc(heap, total); } -mi_decl_restrict void* mi_mallocn(size_t count, size_t size) mi_attr_noexcept { - return mi_heap_mallocn(mi_get_default_heap(),count,size); +mi_decl_nodiscard mi_decl_restrict void* mi_mallocn(size_t count, size_t size) mi_attr_noexcept { + return mi_heap_mallocn(mi_prim_get_default_heap(),count,size); } -// Expand in place or fail +// Expand (or shrink) in place (or fail) void* mi_expand(void* p, size_t newsize) mi_attr_noexcept { + #if MI_PADDING + // we do not shrink/expand with padding enabled + MI_UNUSED(p); MI_UNUSED(newsize); + return NULL; + #else if (p == NULL) return NULL; - size_t size = _mi_usable_size(p,"mi_expand"); + const size_t size = _mi_usable_size(p,"mi_expand"); if (newsize > size) return NULL; return p; // it fits + #endif } -void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) { - if (p == NULL) return _mi_heap_malloc_zero(heap,newsize,zero); - size_t size = _mi_usable_size(p,"mi_realloc"); - if (newsize <= size && newsize >= (size / 2)) { +void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept { + // if p == NULL then behave as malloc. + // else if size == 0 then reallocate to a zero-sized block (and don't return NULL, just as mi_malloc(0)). + // (this means that returning NULL always indicates an error, and `p` will not have been freed in that case.) + const size_t size = _mi_usable_size(p,"mi_realloc"); // also works if p == NULL (with size 0) + if mi_unlikely(newsize <= size && newsize >= (size / 2) && newsize > 0) { // note: newsize must be > 0 or otherwise we return NULL for realloc(NULL,0) + mi_assert_internal(p!=NULL); + // todo: do not track as the usable size is still the same in the free; adjust potential padding? + // mi_track_resize(p,size,newsize) + // if (newsize < size) { mi_track_mem_noaccess((uint8_t*)p + newsize, size - newsize); } return p; // reallocation still fits and not more than 50% waste } void* newp = mi_heap_malloc(heap,newsize); - if (mi_likely(newp != NULL)) { + if mi_likely(newp != NULL) { if (zero && newsize > size) { // also set last word in the previous allocation to zero to ensure any padding is zero-initialized - size_t start = (size >= sizeof(intptr_t) ? size - sizeof(intptr_t) : 0); - memset((uint8_t*)newp + start, 0, newsize - start); + const size_t start = (size >= sizeof(intptr_t) ? size - sizeof(intptr_t) : 0); + _mi_memzero((uint8_t*)newp + start, newsize - start); + } + else if (newsize == 0) { + ((uint8_t*)newp)[0] = 0; // work around for applications that expect zero-reallocation to be zero initialized (issue #725) + } + if mi_likely(p != NULL) { + const size_t copysize = (newsize > size ? size : newsize); + mi_track_mem_defined(p,copysize); // _mi_useable_size may be too large for byte precise memory tracking.. + _mi_memcpy(newp, p, copysize); + mi_free(p); // only free the original pointer if successful } - _mi_memcpy_aligned(newp, p, (newsize > size ? 
size : newsize)); - mi_free(p); // only free if successful } return newp; } -void* mi_heap_realloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept { +mi_decl_nodiscard void* mi_heap_realloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept { return _mi_heap_realloc_zero(heap, p, newsize, false); } -void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept { +mi_decl_nodiscard void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept { size_t total; if (mi_count_size_overflow(count, size, &total)) return NULL; return mi_heap_realloc(heap, p, total); @@ -647,42 +282,42 @@ void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_a // Reallocate but free `p` on errors -void* mi_heap_reallocf(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept { +mi_decl_nodiscard void* mi_heap_reallocf(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept { void* newp = mi_heap_realloc(heap, p, newsize); if (newp==NULL && p!=NULL) mi_free(p); return newp; } -void* mi_heap_rezalloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept { +mi_decl_nodiscard void* mi_heap_rezalloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept { return _mi_heap_realloc_zero(heap, p, newsize, true); } -void* mi_heap_recalloc(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept { +mi_decl_nodiscard void* mi_heap_recalloc(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept { size_t total; if (mi_count_size_overflow(count, size, &total)) return NULL; return mi_heap_rezalloc(heap, p, total); } -void* mi_realloc(void* p, size_t newsize) mi_attr_noexcept { - return mi_heap_realloc(mi_get_default_heap(),p,newsize); +mi_decl_nodiscard void* mi_realloc(void* p, size_t newsize) mi_attr_noexcept { + return mi_heap_realloc(mi_prim_get_default_heap(),p,newsize); } -void* mi_reallocn(void* p, size_t count, size_t size) mi_attr_noexcept { - return mi_heap_reallocn(mi_get_default_heap(),p,count,size); +mi_decl_nodiscard void* mi_reallocn(void* p, size_t count, size_t size) mi_attr_noexcept { + return mi_heap_reallocn(mi_prim_get_default_heap(),p,count,size); } // Reallocate but free `p` on errors -void* mi_reallocf(void* p, size_t newsize) mi_attr_noexcept { - return mi_heap_reallocf(mi_get_default_heap(),p,newsize); +mi_decl_nodiscard void* mi_reallocf(void* p, size_t newsize) mi_attr_noexcept { + return mi_heap_reallocf(mi_prim_get_default_heap(),p,newsize); } -void* mi_rezalloc(void* p, size_t newsize) mi_attr_noexcept { - return mi_heap_rezalloc(mi_get_default_heap(), p, newsize); +mi_decl_nodiscard void* mi_rezalloc(void* p, size_t newsize) mi_attr_noexcept { + return mi_heap_rezalloc(mi_prim_get_default_heap(), p, newsize); } -void* mi_recalloc(void* p, size_t count, size_t size) mi_attr_noexcept { - return mi_heap_recalloc(mi_get_default_heap(), p, count, size); +mi_decl_nodiscard void* mi_recalloc(void* p, size_t count, size_t size) mi_attr_noexcept { + return mi_heap_recalloc(mi_prim_get_default_heap(), p, count, size); } @@ -692,33 +327,33 @@ void* mi_recalloc(void* p, size_t count, size_t size) mi_attr_noexcept { // ------------------------------------------------------ // `strdup` using mi_malloc -mi_decl_restrict char* mi_heap_strdup(mi_heap_t* heap, const char* s) mi_attr_noexcept { +mi_decl_nodiscard mi_decl_restrict char* mi_heap_strdup(mi_heap_t* heap, const char* s) mi_attr_noexcept { if (s == NULL) return NULL; - size_t n = strlen(s); - char* t = 
(char*)mi_heap_malloc(heap,n+1); - if (t != NULL) _mi_memcpy(t, s, n + 1); + size_t len = _mi_strlen(s); + char* t = (char*)mi_heap_malloc(heap,len+1); + if (t == NULL) return NULL; + _mi_memcpy(t, s, len); + t[len] = 0; return t; } -mi_decl_restrict char* mi_strdup(const char* s) mi_attr_noexcept { - return mi_heap_strdup(mi_get_default_heap(), s); +mi_decl_nodiscard mi_decl_restrict char* mi_strdup(const char* s) mi_attr_noexcept { + return mi_heap_strdup(mi_prim_get_default_heap(), s); } // `strndup` using mi_malloc -mi_decl_restrict char* mi_heap_strndup(mi_heap_t* heap, const char* s, size_t n) mi_attr_noexcept { +mi_decl_nodiscard mi_decl_restrict char* mi_heap_strndup(mi_heap_t* heap, const char* s, size_t n) mi_attr_noexcept { if (s == NULL) return NULL; - const char* end = (const char*)memchr(s, 0, n); // find end of string in the first `n` characters (returns NULL if not found) - const size_t m = (end != NULL ? (size_t)(end - s) : n); // `m` is the minimum of `n` or the end-of-string - mi_assert_internal(m <= n); - char* t = (char*)mi_heap_malloc(heap, m+1); + const size_t len = _mi_strnlen(s,n); // len <= n + char* t = (char*)mi_heap_malloc(heap, len+1); if (t == NULL) return NULL; - _mi_memcpy(t, s, m); - t[m] = 0; + _mi_memcpy(t, s, len); + t[len] = 0; return t; } -mi_decl_restrict char* mi_strndup(const char* s, size_t n) mi_attr_noexcept { - return mi_heap_strndup(mi_get_default_heap(),s,n); +mi_decl_nodiscard mi_decl_restrict char* mi_strndup(const char* s, size_t n) mi_attr_noexcept { + return mi_heap_strndup(mi_prim_get_default_heap(),s,n); } #ifndef __wasi__ @@ -728,7 +363,7 @@ mi_decl_restrict char* mi_strndup(const char* s, size_t n) mi_attr_noexcept { #define PATH_MAX MAX_PATH #endif #include -mi_decl_restrict char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept { +mi_decl_nodiscard mi_decl_restrict char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept { // todo: use GetFullPathNameW to allow longer file names char buf[PATH_MAX]; DWORD res = GetFullPathNameA(fname, PATH_MAX, (resolved_name == NULL ? 
buf : resolved_name), NULL); @@ -746,8 +381,9 @@ mi_decl_restrict char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char } } #else +/* #include // pathconf -static size_t mi_path_max() { +static size_t mi_path_max(void) { static size_t path_max = 0; if (path_max <= 0) { long m = pathconf("/",_PC_PATH_MAX); @@ -757,25 +393,37 @@ static size_t mi_path_max() { } return path_max; } - +*/ char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept { if (resolved_name != NULL) { return realpath(fname,resolved_name); } else { - size_t n = mi_path_max(); + char* rname = realpath(fname, NULL); + if (rname == NULL) return NULL; + char* result = mi_heap_strdup(heap, rname); + mi_cfree(rname); // use checked free (which may be redirected to our free but that's ok) + // note: with ASAN realpath is intercepted and mi_cfree may leak the returned pointer :-( + return result; + } + /* + const size_t n = mi_path_max(); char* buf = (char*)mi_malloc(n+1); - if (buf==NULL) return NULL; + if (buf == NULL) { + errno = ENOMEM; + return NULL; + } char* rname = realpath(fname,buf); char* result = mi_heap_strndup(heap,rname,n); // ok if `rname==NULL` mi_free(buf); return result; } + */ } #endif -mi_decl_restrict char* mi_realpath(const char* fname, char* resolved_name) mi_attr_noexcept { - return mi_heap_realpath(mi_get_default_heap(),fname,resolved_name); +mi_decl_nodiscard mi_decl_restrict char* mi_realpath(const char* fname, char* resolved_name) mi_attr_noexcept { + return mi_heap_realpath(mi_prim_get_default_heap(),fname,resolved_name); } #endif @@ -796,9 +444,16 @@ static bool mi_try_new_handler(bool nothrow) { #else std::new_handler h = std::set_new_handler(); std::set_new_handler(h); - #endif + #endif if (h==NULL) { - if (!nothrow) throw std::bad_alloc(); + _mi_error_message(ENOMEM, "out of memory in 'new'"); + #if defined(_CPPUNWIND) || defined(__cpp_exceptions) // exceptions are not always enabled + if (!nothrow) { + throw std::bad_alloc(); + } + #else + MI_UNUSED(nothrow); + #endif return false; } else { @@ -807,13 +462,13 @@ static bool mi_try_new_handler(bool nothrow) { } } #else -typedef void (*std_new_handler_t)(); +typedef void (*std_new_handler_t)(void); -#if (defined(__GNUC__) || defined(__clang__)) -std_new_handler_t __attribute((weak)) _ZSt15get_new_handlerv() { +#if (defined(__GNUC__) || (defined(__clang__) && !defined(_MSC_VER))) // exclude clang-cl, see issue #631 +std_new_handler_t __attribute__((weak)) _ZSt15get_new_handlerv(void) { return NULL; } -static std_new_handler_t mi_get_new_handler() { +static std_new_handler_t mi_get_new_handler(void) { return _ZSt15get_new_handlerv(); } #else @@ -826,7 +481,10 @@ static std_new_handler_t mi_get_new_handler() { static bool mi_try_new_handler(bool nothrow) { std_new_handler_t h = mi_get_new_handler(); if (h==NULL) { - if (!nothrow) exit(ENOMEM); // cannot throw in plain C, use exit as we are out of memory anyway. 
+ _mi_error_message(ENOMEM, "out of memory in 'new'"); + if (!nothrow) { + abort(); // cannot throw in plain C, use abort + } return false; } else { @@ -836,27 +494,53 @@ static bool mi_try_new_handler(bool nothrow) { } #endif -static mi_decl_noinline void* mi_try_new(size_t size, bool nothrow ) { +mi_decl_export mi_decl_noinline void* mi_heap_try_new(mi_heap_t* heap, size_t size, bool nothrow ) { void* p = NULL; while(p == NULL && mi_try_new_handler(nothrow)) { - p = mi_malloc(size); + p = mi_heap_malloc(heap,size); } return p; } -mi_decl_restrict void* mi_new(size_t size) { - void* p = mi_malloc(size); - if (mi_unlikely(p == NULL)) return mi_try_new(size,false); +static mi_decl_noinline void* mi_try_new(size_t size, bool nothrow) { + return mi_heap_try_new(mi_prim_get_default_heap(), size, nothrow); +} + + +mi_decl_nodiscard mi_decl_restrict void* mi_heap_alloc_new(mi_heap_t* heap, size_t size) { + void* p = mi_heap_malloc(heap,size); + if mi_unlikely(p == NULL) return mi_heap_try_new(heap, size, false); return p; } -mi_decl_restrict void* mi_new_nothrow(size_t size) mi_attr_noexcept { +mi_decl_nodiscard mi_decl_restrict void* mi_new(size_t size) { + return mi_heap_alloc_new(mi_prim_get_default_heap(), size); +} + + +mi_decl_nodiscard mi_decl_restrict void* mi_heap_alloc_new_n(mi_heap_t* heap, size_t count, size_t size) { + size_t total; + if mi_unlikely(mi_count_size_overflow(count, size, &total)) { + mi_try_new_handler(false); // on overflow we invoke the try_new_handler once to potentially throw std::bad_alloc + return NULL; + } + else { + return mi_heap_alloc_new(heap,total); + } +} + +mi_decl_nodiscard mi_decl_restrict void* mi_new_n(size_t count, size_t size) { + return mi_heap_alloc_new_n(mi_prim_get_default_heap(), size, count); +} + + +mi_decl_nodiscard mi_decl_restrict void* mi_new_nothrow(size_t size) mi_attr_noexcept { void* p = mi_malloc(size); - if (mi_unlikely(p == NULL)) return mi_try_new(size, true); + if mi_unlikely(p == NULL) return mi_try_new(size, true); return p; } -mi_decl_restrict void* mi_new_aligned(size_t size, size_t alignment) { +mi_decl_nodiscard mi_decl_restrict void* mi_new_aligned(size_t size, size_t alignment) { void* p; do { p = mi_malloc_aligned(size, alignment); @@ -865,7 +549,7 @@ mi_decl_restrict void* mi_new_aligned(size_t size, size_t alignment) { return p; } -mi_decl_restrict void* mi_new_aligned_nothrow(size_t size, size_t alignment) mi_attr_noexcept { +mi_decl_nodiscard mi_decl_restrict void* mi_new_aligned_nothrow(size_t size, size_t alignment) mi_attr_noexcept { void* p; do { p = mi_malloc_aligned(size, alignment); @@ -874,18 +558,7 @@ mi_decl_restrict void* mi_new_aligned_nothrow(size_t size, size_t alignment) mi_ return p; } -mi_decl_restrict void* mi_new_n(size_t count, size_t size) { - size_t total; - if (mi_unlikely(mi_count_size_overflow(count, size, &total))) { - mi_try_new_handler(false); // on overflow we invoke the try_new_handler once to potentially throw std::bad_alloc - return NULL; - } - else { - return mi_new(total); - } -} - -void* mi_new_realloc(void* p, size_t newsize) { +mi_decl_nodiscard void* mi_new_realloc(void* p, size_t newsize) { void* q; do { q = mi_realloc(p, newsize); @@ -893,9 +566,9 @@ void* mi_new_realloc(void* p, size_t newsize) { return q; } -void* mi_new_reallocn(void* p, size_t newcount, size_t size) { +mi_decl_nodiscard void* mi_new_reallocn(void* p, size_t newcount, size_t size) { size_t total; - if (mi_unlikely(mi_count_size_overflow(newcount, size, &total))) { + if 
mi_unlikely(mi_count_size_overflow(newcount, size, &total)) { mi_try_new_handler(false); // on overflow we invoke the try_new_handler once to potentially throw std::bad_alloc return NULL; } @@ -903,3 +576,23 @@ void* mi_new_reallocn(void* p, size_t newcount, size_t size) { return mi_new_realloc(p, total); } } + +// ------------------------------------------------------ +// ensure explicit external inline definitions are emitted! +// ------------------------------------------------------ + +#ifdef __cplusplus +void* _mi_externs[] = { + (void*)&_mi_page_malloc, + (void*)&_mi_heap_malloc_zero, + (void*)&_mi_heap_malloc_zero_ex, + (void*)&mi_malloc, + (void*)&mi_malloc_small, + (void*)&mi_zalloc_small, + (void*)&mi_heap_malloc, + (void*)&mi_heap_zalloc, + (void*)&mi_heap_malloc_small + // (void*)&mi_heap_alloc_new, + // (void*)&mi_heap_alloc_new_n +}; +#endif diff --git a/contrib/libs/mimalloc/src/arena.c b/contrib/libs/mimalloc/src/arena.c index 0e6615a420f1..25ce56ec8fe9 100644 --- a/contrib/libs/mimalloc/src/arena.c +++ b/contrib/libs/mimalloc/src/arena.c @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2019-2021, Microsoft Research, Daan Leijen +Copyright (c) 2019-2023, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -7,107 +7,214 @@ terms of the MIT license. A copy of the license can be found in the file /* ---------------------------------------------------------------------------- "Arenas" are fixed area's of OS memory from which we can allocate -large blocks (>= MI_ARENA_BLOCK_SIZE, 32MiB). +large blocks (>= MI_ARENA_MIN_BLOCK_SIZE, 4MiB). In contrast to the rest of mimalloc, the arenas are shared between threads and need to be accessed using atomic operations. -Currently arenas are only used to for huge OS page (1GiB) reservations, -otherwise it delegates to direct allocation from the OS. -In the future, we can expose an API to manually add more kinds of arenas -which is sometimes needed for embedded devices or shared memory for example. -(We can also employ this with WASI or `sbrk` systems to reserve large arenas - on demand and be able to reuse them efficiently). - -The arena allocation needs to be thread safe and we use an atomic -bitmap to allocate. The current implementation of the bitmap can -only do this within a field (`uintptr_t`) so we can allocate at most -blocks of 2GiB (64*32MiB) and no object can cross the boundary. This -can lead to fragmentation but fortunately most objects will be regions -of 256MiB in practice. +Arenas are used to for huge OS page (1GiB) reservations or for reserving +OS memory upfront which can be improve performance or is sometimes needed +on embedded devices. We can also employ this with WASI or `sbrk` systems +to reserve large arenas upfront and be able to reuse the memory more effectively. + +The arena allocation needs to be thread safe and we use an atomic bitmap to allocate. 
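// A minimal sketch of the atomic-bitmap claim described above, restricted to a single
// 64-bit field and written with C11 atomics under hypothetical demo_* names; the real
// code (e.g. _mi_bitmap_try_find_from_claim_across, declared via bitmap.h) also searches
// for a free range and handles claims that span multiple bitmap fields.
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

// Try to atomically set `count` consecutive bits starting at `idx` (idx + count <= 64).
// Succeeds only if all bits in the range were clear, i.e. the blocks were free.
static bool demo_bitmap_try_claim(_Atomic(uint64_t)* field, size_t idx, size_t count) {
  const uint64_t bits = (count >= 64 ? UINT64_MAX : (((uint64_t)1 << count) - 1));
  const uint64_t mask = bits << idx;
  uint64_t expected = atomic_load_explicit(field, memory_order_relaxed);
  do {
    if ((expected & mask) != 0) return false;  // some block in the range is already in use
  } while (!atomic_compare_exchange_weak_explicit(field, &expected, expected | mask,
                                                  memory_order_acq_rel, memory_order_relaxed));
  return true;  // the caller now owns the claimed blocks
}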
-----------------------------------------------------------------------------*/ #include "mimalloc.h" -#include "mimalloc-internal.h" -#include "mimalloc-atomic.h" +#include "mimalloc/internal.h" +#include "mimalloc/atomic.h" #include // memset -#include // ENOMEM +#include // ENOMEM #include "bitmap.h" // atomic bitmap - -// os.c -void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_stats_t* stats); -void _mi_os_free_ex(void* p, size_t size, bool was_committed, mi_stats_t* stats); -void _mi_os_free(void* p, size_t size, mi_stats_t* stats); - -void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_secs, size_t* pages_reserved, size_t* psize); -void _mi_os_free_huge_pages(void* p, size_t size, mi_stats_t* stats); - -bool _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats); -bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats); - /* ----------------------------------------------------------- Arena allocation ----------------------------------------------------------- */ -#define MI_SEGMENT_ALIGN MI_SEGMENT_SIZE -#define MI_ARENA_BLOCK_SIZE (4*MI_SEGMENT_ALIGN) // 32MiB -#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_BLOCK_SIZE/2) // 16MiB -#define MI_MAX_ARENAS (64) // not more than 256 (since we use 8 bits in the memid) +// Block info: bit 0 contains the `in_use` bit, the upper bits the +// size in count of arena blocks. +typedef uintptr_t mi_block_info_t; +#define MI_ARENA_BLOCK_SIZE (MI_SEGMENT_SIZE) // 64MiB (must be at least MI_SEGMENT_ALIGN) +#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_BLOCK_SIZE/2) // 32MiB +#define MI_MAX_ARENAS (112) // not more than 126 (since we use 7 bits in the memid and an arena index + 1) // A memory arena descriptor typedef struct mi_arena_s { + mi_arena_id_t id; // arena id; 0 for non-specific + mi_memid_t memid; // memid of the memory area _Atomic(uint8_t*) start; // the start of the memory area size_t block_count; // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`) size_t field_count; // number of bitmap fields (where `field_count * MI_BITMAP_FIELD_BITS >= block_count`) + size_t meta_size; // size of the arena structure itself (including its bitmaps) + mi_memid_t meta_memid; // memid of the arena structure itself (OS or static allocation) int numa_node; // associated NUMA node - bool is_zero_init; // is the arena zero initialized? - bool is_committed; // is the memory fully committed? (if so, block_committed == NULL) - bool is_large; // large- or huge OS pages (always committed) - _Atomic(uintptr_t) search_idx; // optimization to start the search for free blocks + bool exclusive; // only allow allocations if specifically for this arena + bool is_large; // memory area consists of large- or huge OS pages (always committed) + _Atomic(size_t) search_idx; // optimization to start the search for free blocks + _Atomic(mi_msecs_t) purge_expire; // expiration time when blocks should be decommitted from `blocks_decommit`. mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero? - mi_bitmap_field_t* blocks_committed; // if `!is_committed`, are the blocks committed? + mi_bitmap_field_t* blocks_committed; // are the blocks committed? (can be NULL for memory that cannot be decommitted) + mi_bitmap_field_t* blocks_purge; // blocks that can be (reset) decommitted. (can be NULL for memory that cannot be (reset) decommitted) + mi_bitmap_field_t* blocks_abandoned; // blocks that start with an abandoned segment. 
(This crosses API's but it is convenient to have here) mi_bitmap_field_t blocks_inuse[1]; // in-place bitmap of in-use blocks (of size `field_count`) + // do not add further fields here as the dirty, committed, purged, and abandoned bitmaps follow the inuse bitmap fields. } mi_arena_t; // The available arenas static mi_decl_cache_align _Atomic(mi_arena_t*) mi_arenas[MI_MAX_ARENAS]; -static mi_decl_cache_align _Atomic(uintptr_t) mi_arena_count; // = 0 +static mi_decl_cache_align _Atomic(size_t) mi_arena_count; // = 0 + +//static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept; /* ----------------------------------------------------------- - Arena allocations get a memory id where the lower 8 bits are - the arena index +1, and the upper bits the block index. + Arena id's + id = arena_index + 1 ----------------------------------------------------------- */ -// Use `0` as a special id for direct OS allocated memory. -#define MI_MEMID_OS 0 +static size_t mi_arena_id_index(mi_arena_id_t id) { + return (size_t)(id <= 0 ? MI_MAX_ARENAS : id - 1); +} + +static mi_arena_id_t mi_arena_id_create(size_t arena_index) { + mi_assert_internal(arena_index < MI_MAX_ARENAS); + return (int)arena_index + 1; +} + +mi_arena_id_t _mi_arena_id_none(void) { + return 0; +} -static size_t mi_arena_id_create(size_t arena_index, mi_bitmap_index_t bitmap_index) { - mi_assert_internal(arena_index < 0xFE); - mi_assert_internal(((bitmap_index << 8) >> 8) == bitmap_index); // no overflow? - return ((bitmap_index << 8) | ((arena_index+1) & 0xFF)); +static bool mi_arena_id_is_suitable(mi_arena_id_t arena_id, bool arena_is_exclusive, mi_arena_id_t req_arena_id) { + return ((!arena_is_exclusive && req_arena_id == _mi_arena_id_none()) || + (arena_id == req_arena_id)); } -static void mi_arena_id_indices(size_t memid, size_t* arena_index, mi_bitmap_index_t* bitmap_index) { - mi_assert_internal(memid != MI_MEMID_OS); - *arena_index = (memid & 0xFF) - 1; - *bitmap_index = (memid >> 8); +bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id) { + if (memid.memkind == MI_MEM_ARENA) { + return mi_arena_id_is_suitable(memid.mem.arena.id, memid.mem.arena.is_exclusive, request_arena_id); + } + else { + return mi_arena_id_is_suitable(_mi_arena_id_none(), false, request_arena_id); + } } + +/* ----------------------------------------------------------- + Arena allocations get a (currently) 16-bit memory id where the + lower 8 bits are the arena id, and the upper bits the block index. 
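// The packed id described above can be pictured with the hypothetical helpers below;
// note that in this version the arena id and block index are in fact carried as separate
// fields of a mi_memid_t (see mi_memid_create_arena below) rather than packed by hand.
#include <stddef.h>

static inline size_t demo_arena_memid_pack(size_t arena_id, size_t block_index) {
  return (block_index << 8) | (arena_id & 0xFF);   // low 8 bits: arena id (0 means "no arena")
}

static inline void demo_arena_memid_unpack(size_t memid, size_t* arena_id, size_t* block_index) {
  *arena_id    = (memid & 0xFF);
  *block_index = (memid >> 8);
}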
+----------------------------------------------------------- */ + static size_t mi_block_count_of_size(size_t size) { return _mi_divide_up(size, MI_ARENA_BLOCK_SIZE); } +static size_t mi_arena_block_size(size_t bcount) { + return (bcount * MI_ARENA_BLOCK_SIZE); +} + +static size_t mi_arena_size(mi_arena_t* arena) { + return mi_arena_block_size(arena->block_count); +} + +static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, mi_bitmap_index_t bitmap_index) { + mi_memid_t memid = _mi_memid_create(MI_MEM_ARENA); + memid.mem.arena.id = id; + memid.mem.arena.block_index = bitmap_index; + memid.mem.arena.is_exclusive = is_exclusive; + return memid; +} + +static bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, mi_bitmap_index_t* bitmap_index) { + mi_assert_internal(memid.memkind == MI_MEM_ARENA); + *arena_index = mi_arena_id_index(memid.mem.arena.id); + *bitmap_index = memid.mem.arena.block_index; + return memid.mem.arena.is_exclusive; +} + + + +/* ----------------------------------------------------------- + Special static area for mimalloc internal structures + to avoid OS calls (for example, for the arena metadata) +----------------------------------------------------------- */ + +#define MI_ARENA_STATIC_MAX (MI_INTPTR_SIZE*MI_KiB) // 8 KiB on 64-bit + +static mi_decl_cache_align uint8_t mi_arena_static[MI_ARENA_STATIC_MAX]; // must be cache aligned, see issue #895 +static mi_decl_cache_align _Atomic(size_t) mi_arena_static_top; + +static void* mi_arena_static_zalloc(size_t size, size_t alignment, mi_memid_t* memid) { + *memid = _mi_memid_none(); + if (size == 0 || size > MI_ARENA_STATIC_MAX) return NULL; + const size_t toplow = mi_atomic_load_relaxed(&mi_arena_static_top); + if ((toplow + size) > MI_ARENA_STATIC_MAX) return NULL; + + // try to claim space + if (alignment < MI_MAX_ALIGN_SIZE) { alignment = MI_MAX_ALIGN_SIZE; } + const size_t oversize = size + alignment - 1; + if (toplow + oversize > MI_ARENA_STATIC_MAX) return NULL; + const size_t oldtop = mi_atomic_add_acq_rel(&mi_arena_static_top, oversize); + size_t top = oldtop + oversize; + if (top > MI_ARENA_STATIC_MAX) { + // try to roll back, ok if this fails + mi_atomic_cas_strong_acq_rel(&mi_arena_static_top, &top, oldtop); + return NULL; + } + + // success + *memid = _mi_memid_create(MI_MEM_STATIC); + memid->initially_zero = true; + const size_t start = _mi_align_up(oldtop, alignment); + uint8_t* const p = &mi_arena_static[start]; + _mi_memzero_aligned(p, size); + return p; +} + +static void* mi_arena_meta_zalloc(size_t size, mi_memid_t* memid, mi_stats_t* stats) { + *memid = _mi_memid_none(); + + // try static + void* p = mi_arena_static_zalloc(size, MI_MAX_ALIGN_SIZE, memid); + if (p != NULL) return p; + + // or fall back to the OS + p = _mi_os_alloc(size, memid, stats); + if (p == NULL) return NULL; + + // zero the OS memory if needed + if (!memid->initially_zero) { + _mi_memzero_aligned(p, size); + memid->initially_zero = true; + } + return p; +} + +static void mi_arena_meta_free(void* p, mi_memid_t memid, size_t size, mi_stats_t* stats) { + if (mi_memkind_is_os(memid.memkind)) { + _mi_os_free(p, size, memid, stats); + } + else { + mi_assert(memid.memkind == MI_MEM_STATIC); + } +} + +static void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex) { + return (arena->start + mi_arena_block_size(mi_bitmap_index_bit(bindex))); +} + + /* ----------------------------------------------------------- Thread safe allocation in an arena 
----------------------------------------------------------- */ -static bool mi_arena_alloc(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* bitmap_idx) + +// claim the `blocks_inuse` bits +static bool mi_arena_try_claim(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats) { - size_t idx = mi_atomic_load_acquire(&arena->search_idx); // start from last search - if (_mi_bitmap_try_find_from_claim_across(arena->blocks_inuse, arena->field_count, idx, blocks, bitmap_idx)) { - mi_atomic_store_release(&arena->search_idx, idx); // start search from here next time + size_t idx = 0; // mi_atomic_load_relaxed(&arena->search_idx); // start from last search; ok to be relaxed as the exact start does not matter + if (_mi_bitmap_try_find_from_claim_across(arena->blocks_inuse, arena->field_count, idx, blocks, bitmap_idx, stats)) { + mi_atomic_store_relaxed(&arena->search_idx, mi_bitmap_index_field(*bitmap_idx)); // start search from found location next time around return true; }; return false; @@ -118,194 +225,713 @@ static bool mi_arena_alloc(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* Arena Allocation ----------------------------------------------------------- */ -static void* mi_arena_alloc_from(mi_arena_t* arena, size_t arena_index, size_t needed_bcount, - bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld) +static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t arena_index, size_t needed_bcount, + bool commit, mi_memid_t* memid, mi_os_tld_t* tld) { + MI_UNUSED(arena_index); + mi_assert_internal(mi_arena_id_index(arena->id) == arena_index); + mi_bitmap_index_t bitmap_index; - if (!mi_arena_alloc(arena, needed_bcount, &bitmap_index)) return NULL; - - // claimed it! set the dirty bits (todo: no need for an atomic op here?) - void* p = arena->start + (mi_bitmap_index_bit(bitmap_index)*MI_ARENA_BLOCK_SIZE); - *memid = mi_arena_id_create(arena_index, bitmap_index); - *is_zero = _mi_bitmap_claim_across(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL); - *large = arena->is_large; - *is_pinned = (arena->is_large || arena->is_committed); - if (arena->is_committed) { + if (!mi_arena_try_claim(arena, needed_bcount, &bitmap_index, tld->stats)) return NULL; + + // claimed it! + void* p = mi_arena_block_start(arena, bitmap_index); + *memid = mi_memid_create_arena(arena->id, arena->exclusive, bitmap_index); + memid->is_pinned = arena->memid.is_pinned; + + // none of the claimed blocks should be scheduled for a decommit + if (arena->blocks_purge != NULL) { + // this is thread safe as a potential purge only decommits parts that are not yet claimed as used (in `blocks_inuse`). + _mi_bitmap_unclaim_across(arena->blocks_purge, arena->field_count, needed_bcount, bitmap_index); + } + + // set the dirty bits (todo: no need for an atomic op here?) 
+ if (arena->memid.initially_zero && arena->blocks_dirty != NULL) { + memid->initially_zero = _mi_bitmap_claim_across(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL); + } + + // set commit state + if (arena->blocks_committed == NULL) { // always committed - *commit = true; + memid->initially_committed = true; } - else if (*commit) { - // arena not committed as a whole, but commit requested: ensure commit now + else if (commit) { + // commit requested, but the range may not be committed as a whole: ensure it is committed now + memid->initially_committed = true; bool any_uncommitted; _mi_bitmap_claim_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted); if (any_uncommitted) { - bool commit_zero; - _mi_os_commit(p, needed_bcount * MI_ARENA_BLOCK_SIZE, &commit_zero, tld->stats); - if (commit_zero) *is_zero = true; + bool commit_zero = false; + if (!_mi_os_commit(p, mi_arena_block_size(needed_bcount), &commit_zero, tld->stats)) { + memid->initially_committed = false; + } + else { + if (commit_zero) { memid->initially_zero = true; } + } } } else { // no need to commit, but check if already fully committed - *commit = _mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index); + memid->initially_committed = _mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index); } + return p; } -void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_pinned, bool* is_zero, - size_t* memid, mi_os_tld_t* tld) +// allocate in a speficic arena +static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_node, int numa_node, size_t size, size_t alignment, + bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld ) { - mi_assert_internal(commit != NULL && is_pinned != NULL && is_zero != NULL && memid != NULL && tld != NULL); - mi_assert_internal(size > 0); - *memid = MI_MEMID_OS; - *is_zero = false; - *is_pinned = false; - - // try to allocate in an arena if the alignment is small enough - // and the object is not too large or too small. 
- if (alignment <= MI_SEGMENT_ALIGN && - size >= MI_ARENA_MIN_OBJ_SIZE && - mi_atomic_load_relaxed(&mi_arena_count) > 0) - { - const size_t bcount = mi_block_count_of_size(size); - const int numa_node = _mi_os_numa_node(tld); // current numa node + MI_UNUSED_RELEASE(alignment); + mi_assert_internal(alignment <= MI_SEGMENT_ALIGN); + const size_t bcount = mi_block_count_of_size(size); + const size_t arena_index = mi_arena_id_index(arena_id); + mi_assert_internal(arena_index < mi_atomic_load_relaxed(&mi_arena_count)); + mi_assert_internal(size <= mi_arena_block_size(bcount)); + + // Check arena suitability + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]); + if (arena == NULL) return NULL; + if (!allow_large && arena->is_large) return NULL; + if (!mi_arena_id_is_suitable(arena->id, arena->exclusive, req_arena_id)) return NULL; + if (req_arena_id == _mi_arena_id_none()) { // in not specific, check numa affinity + const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node); + if (match_numa_node) { if (!numa_suitable) return NULL; } + else { if (numa_suitable) return NULL; } + } - mi_assert_internal(size <= bcount*MI_ARENA_BLOCK_SIZE); + // try to allocate + void* p = mi_arena_try_alloc_at(arena, arena_index, bcount, commit, memid, tld); + mi_assert_internal(p == NULL || _mi_is_aligned(p, alignment)); + return p; +} + + +// allocate from an arena with fallback to the OS +static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, size_t alignment, + bool commit, bool allow_large, + mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld ) +{ + MI_UNUSED(alignment); + mi_assert_internal(alignment <= MI_SEGMENT_ALIGN); + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + if mi_likely(max_arena == 0) return NULL; + + if (req_arena_id != _mi_arena_id_none()) { + // try a specific arena if requested + if (mi_arena_id_index(req_arena_id) < max_arena) { + void* p = mi_arena_try_alloc_at_id(req_arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); + if (p != NULL) return p; + } + } + else { // try numa affine allocation - for (size_t i = 0; i < MI_MAX_ARENAS; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); - if (arena==NULL) break; // end reached - if ((arena->numa_node<0 || arena->numa_node==numa_node) && // numa local? - (*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages - { - void* p = mi_arena_alloc_from(arena, i, bcount, commit, large, is_pinned, is_zero, memid, tld); - mi_assert_internal((uintptr_t)p % alignment == 0); - if (p != NULL) return p; - } + for (size_t i = 0; i < max_arena; i++) { + void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); + if (p != NULL) return p; } + // try from another numa node instead.. - for (size_t i = 0; i < MI_MAX_ARENAS; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); - if (arena==NULL) break; // end reached - if ((arena->numa_node>=0 && arena->numa_node!=numa_node) && // not numa local! 
- (*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages - { - void* p = mi_arena_alloc_from(arena, i, bcount, commit, large, is_pinned, is_zero, memid, tld); - mi_assert_internal((uintptr_t)p % alignment == 0); + if (numa_node >= 0) { // if numa_node was < 0 (no specific affinity requested), all arena's have been tried already + for (size_t i = 0; i < max_arena; i++) { + void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), false /* only proceed if not numa local */, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); if (p != NULL) return p; } } } + return NULL; +} - // finally, fall back to the OS - if (mi_option_is_enabled(mi_option_limit_os_alloc)) { +// try to reserve a fresh arena space +static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t *arena_id) +{ + if (_mi_preloading()) return false; // use OS only while pre loading + if (req_arena_id != _mi_arena_id_none()) return false; + + const size_t arena_count = mi_atomic_load_acquire(&mi_arena_count); + if (arena_count > (MI_MAX_ARENAS - 4)) return false; + + size_t arena_reserve = mi_option_get_size(mi_option_arena_reserve); + if (arena_reserve == 0) return false; + + if (!_mi_os_has_virtual_reserve()) { + arena_reserve = arena_reserve/4; // be conservative if virtual reserve is not supported (for WASM for example) + } + arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_BLOCK_SIZE); + if (arena_count >= 8 && arena_count <= 128) { + arena_reserve = ((size_t)1<<(arena_count/8)) * arena_reserve; // scale up the arena sizes exponentially + } + if (arena_reserve < req_size) return false; // should be able to at least handle the current allocation size + + // commit eagerly? + bool arena_commit = false; + if (mi_option_get(mi_option_arena_eager_commit) == 2) { arena_commit = _mi_os_has_overcommit(); } + else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; } + + return (mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id) == 0); +} + + +void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, + mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) +{ + mi_assert_internal(memid != NULL && tld != NULL); + mi_assert_internal(size > 0); + *memid = _mi_memid_none(); + + const int numa_node = _mi_os_numa_node(tld); // current numa node + + // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) + if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) || req_arena_id != _mi_arena_id_none()) { // is arena allocation allowed? 
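/* mi_arena_reserve above grows the size of freshly reserved arenas
   geometrically: from the 8th arena onward the reserve doubles every 8
   arenas, so large workloads end up with a few big arenas instead of many
   small ones. A standalone check of that scaling rule (the 64 MiB base is
   only an example value, not the actual option default): */
#include <assert.h>
#include <stddef.h>

static size_t scaled_reserve(size_t base, size_t arena_count) {
  if (arena_count >= 8 && arena_count <= 128) {
    return ((size_t)1 << (arena_count / 8)) * base;   /* x2 for every 8 arenas */
  }
  return base;
}

int main(void) {
  const size_t base = (size_t)64 * 1024 * 1024;  /* example base reserve */
  assert(scaled_reserve(base, 0)  == base);      /* first arenas: base size */
  assert(scaled_reserve(base, 8)  == 2 * base);
  assert(scaled_reserve(base, 16) == 4 * base);
  assert(scaled_reserve(base, 24) == 8 * base);
  return 0;
}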
+ if (size >= MI_ARENA_MIN_OBJ_SIZE && alignment <= MI_SEGMENT_ALIGN && align_offset == 0) { + void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); + if (p != NULL) return p; + + // otherwise, try to first eagerly reserve a new arena + if (req_arena_id == _mi_arena_id_none()) { + mi_arena_id_t arena_id = 0; + if (mi_arena_reserve(size, allow_large, req_arena_id, &arena_id)) { + // and try allocate in there + mi_assert_internal(req_arena_id == _mi_arena_id_none()); + p = mi_arena_try_alloc_at_id(arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); + if (p != NULL) return p; + } + } + } + } + + // if we cannot use OS allocation, return NULL + if (mi_option_is_enabled(mi_option_disallow_os_alloc) || req_arena_id != _mi_arena_id_none()) { errno = ENOMEM; return NULL; } - *is_zero = true; - *memid = MI_MEMID_OS; - void* p = _mi_os_alloc_aligned(size, alignment, *commit, large, tld->stats); - if (p != NULL) *is_pinned = *large; - return p; + + // finally, fall back to the OS + if (align_offset > 0) { + return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid, tld->stats); + } + else { + return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, tld->stats); + } } -void* _mi_arena_alloc(size_t size, bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld) +void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) { - return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, commit, large, is_pinned, is_zero, memid, tld); + return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, allow_large, req_arena_id, memid, tld); } + +void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) { + if (size != NULL) *size = 0; + size_t arena_index = mi_arena_id_index(arena_id); + if (arena_index >= MI_MAX_ARENAS) return NULL; + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]); + if (arena == NULL) return NULL; + if (size != NULL) { *size = mi_arena_block_size(arena->block_count); } + return arena->start; +} + + +/* ----------------------------------------------------------- + Arena purge +----------------------------------------------------------- */ + +static long mi_arena_purge_delay(void) { + // <0 = no purging allowed, 0=immediate purging, >0=milli-second delay + return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult)); +} + +// reset or decommit in an arena and update the committed/decommit bitmaps +// assumes we own the area (i.e. 
blocks_in_use is claimed by us) +static void mi_arena_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks, mi_stats_t* stats) { + mi_assert_internal(arena->blocks_committed != NULL); + mi_assert_internal(arena->blocks_purge != NULL); + mi_assert_internal(!arena->memid.is_pinned); + const size_t size = mi_arena_block_size(blocks); + void* const p = mi_arena_block_start(arena, bitmap_idx); + bool needs_recommit; + if (_mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx)) { + // all blocks are committed, we can purge freely + needs_recommit = _mi_os_purge(p, size, stats); + } + else { + // some blocks are not committed -- this can happen when a partially committed block is freed + // in `_mi_arena_free` and it is conservatively marked as uncommitted but still scheduled for a purge + // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory), + // and also undo the decommit stats (as it was already adjusted) + mi_assert_internal(mi_option_is_enabled(mi_option_purge_decommits)); + needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */, stats); + if (needs_recommit) { _mi_stat_increase(&_mi_stats_main.committed, size); } + } + + // clear the purged blocks + _mi_bitmap_unclaim_across(arena->blocks_purge, arena->field_count, blocks, bitmap_idx); + // update committed bitmap + if (needs_recommit) { + _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx); + } +} + +// Schedule a purge. This is usually delayed to avoid repeated decommit/commit calls. +// Note: assumes we (still) own the area as we may purge immediately +static void mi_arena_schedule_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks, mi_stats_t* stats) { + mi_assert_internal(arena->blocks_purge != NULL); + const long delay = mi_arena_purge_delay(); + if (delay < 0) return; // is purging allowed at all? + + if (_mi_preloading() || delay == 0) { + // decommit directly + mi_arena_purge(arena, bitmap_idx, blocks, stats); + } + else { + // schedule decommit + mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); + if (expire != 0) { + mi_atomic_addi64_acq_rel(&arena->purge_expire, (mi_msecs_t)(delay/10)); // add smallish extra delay + } + else { + mi_atomic_storei64_release(&arena->purge_expire, _mi_clock_now() + delay); + } + _mi_bitmap_claim_across(arena->blocks_purge, arena->field_count, blocks, bitmap_idx, NULL); + } +} + +// purge a range of blocks +// return true if the full range was purged. +// assumes we own the area (i.e. 
blocks_in_use is claimed by us) +static bool mi_arena_purge_range(mi_arena_t* arena, size_t idx, size_t startidx, size_t bitlen, size_t purge, mi_stats_t* stats) { + const size_t endidx = startidx + bitlen; + size_t bitidx = startidx; + bool all_purged = false; + while (bitidx < endidx) { + // count consequetive ones in the purge mask + size_t count = 0; + while (bitidx + count < endidx && (purge & ((size_t)1 << (bitidx + count))) != 0) { + count++; + } + if (count > 0) { + // found range to be purged + const mi_bitmap_index_t range_idx = mi_bitmap_index_create(idx, bitidx); + mi_arena_purge(arena, range_idx, count, stats); + if (count == bitlen) { + all_purged = true; + } + } + bitidx += (count+1); // +1 to skip the zero bit (or end) + } + return all_purged; +} + +// returns true if anything was purged +static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi_stats_t* stats) +{ + if (arena->memid.is_pinned || arena->blocks_purge == NULL) return false; + mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); + if (expire == 0) return false; + if (!force && expire > now) return false; + + // reset expire (if not already set concurrently) + mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire, (mi_msecs_t)0); + + // potential purges scheduled, walk through the bitmap + bool any_purged = false; + bool full_purge = true; + for (size_t i = 0; i < arena->field_count; i++) { + size_t purge = mi_atomic_load_relaxed(&arena->blocks_purge[i]); + if (purge != 0) { + size_t bitidx = 0; + while (bitidx < MI_BITMAP_FIELD_BITS) { + // find consequetive range of ones in the purge mask + size_t bitlen = 0; + while (bitidx + bitlen < MI_BITMAP_FIELD_BITS && (purge & ((size_t)1 << (bitidx + bitlen))) != 0) { + bitlen++; + } + // try to claim the longest range of corresponding in_use bits + const mi_bitmap_index_t bitmap_index = mi_bitmap_index_create(i, bitidx); + while( bitlen > 0 ) { + if (_mi_bitmap_try_claim(arena->blocks_inuse, arena->field_count, bitlen, bitmap_index)) { + break; + } + bitlen--; + } + // actual claimed bits at `in_use` + if (bitlen > 0) { + // read purge again now that we have the in_use bits + purge = mi_atomic_load_acquire(&arena->blocks_purge[i]); + if (!mi_arena_purge_range(arena, i, bitidx, bitlen, purge, stats)) { + full_purge = false; + } + any_purged = true; + // release the claimed `in_use` bits again + _mi_bitmap_unclaim(arena->blocks_inuse, arena->field_count, bitlen, bitmap_index); + } + bitidx += (bitlen+1); // +1 to skip the zero (or end) + } // while bitidx + } // purge != 0 + } + // if not fully purged, make sure to purge again in the future + if (!full_purge) { + const long delay = mi_arena_purge_delay(); + mi_msecs_t expected = 0; + mi_atomic_casi64_strong_acq_rel(&arena->purge_expire,&expected,_mi_clock_now() + delay); + } + return any_purged; +} + +static void mi_arenas_try_purge( bool force, bool visit_all, mi_stats_t* stats ) { + if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled + + const size_t max_arena = mi_atomic_load_acquire(&mi_arena_count); + if (max_arena == 0) return; + + // allow only one thread to purge at a time + static mi_atomic_guard_t purge_guard; + mi_atomic_guard(&purge_guard) + { + mi_msecs_t now = _mi_clock_now(); + size_t max_purge_count = (visit_all ? 
max_arena : 1); + for (size_t i = 0; i < max_arena; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); + if (arena != NULL) { + if (mi_arena_try_purge(arena, now, force, stats)) { + if (max_purge_count <= 1) break; + max_purge_count--; + } + } + } + } +} + + /* ----------------------------------------------------------- Arena free ----------------------------------------------------------- */ -void _mi_arena_free(void* p, size_t size, size_t memid, bool all_committed, mi_stats_t* stats) { +void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memid, mi_stats_t* stats) { mi_assert_internal(size > 0 && stats != NULL); + mi_assert_internal(committed_size <= size); if (p==NULL) return; if (size==0) return; - if (memid == MI_MEMID_OS) { + const bool all_committed = (committed_size == size); + + if (mi_memkind_is_os(memid.memkind)) { // was a direct OS allocation, pass through - _mi_os_free_ex(p, size, all_committed, stats); + if (!all_committed && committed_size > 0) { + // if partially committed, adjust the committed stats (as `_mi_os_free` will increase decommit by the full size) + _mi_stat_decrease(&_mi_stats_main.committed, committed_size); + } + _mi_os_free(p, size, memid, stats); } - else { + else if (memid.memkind == MI_MEM_ARENA) { // allocated in an arena size_t arena_idx; size_t bitmap_idx; - mi_arena_id_indices(memid, &arena_idx, &bitmap_idx); + mi_arena_memid_indices(memid, &arena_idx, &bitmap_idx); mi_assert_internal(arena_idx < MI_MAX_ARENAS); - mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t,&mi_arenas[arena_idx]); + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t,&mi_arenas[arena_idx]); mi_assert_internal(arena != NULL); const size_t blocks = mi_block_count_of_size(size); + // checks if (arena == NULL) { - _mi_error_message(EINVAL, "trying to free from non-existent arena: %p, size %zu, memid: 0x%zx\n", p, size, memid); + _mi_error_message(EINVAL, "trying to free from an invalid arena: %p, size %zu, memid: 0x%zx\n", p, size, memid); return; } mi_assert_internal(arena->field_count > mi_bitmap_index_field(bitmap_idx)); if (arena->field_count <= mi_bitmap_index_field(bitmap_idx)) { - _mi_error_message(EINVAL, "trying to free from non-existent arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid); + _mi_error_message(EINVAL, "trying to free from an invalid arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid); return; } + + // need to set all memory to undefined as some parts may still be marked as no_access (like padding etc.) 
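/* The purge walk above (mi_arena_purge_range and mi_arena_try_purge) extracts
   maximal runs of set bits from each bitmap field and purges one run at a
   time, skipping the zero bit between runs. A standalone sketch of that inner
   scan over a single field (illustrative only): */
#include <stddef.h>
#include <stdio.h>

#define FIELD_BITS (8 * sizeof(size_t))

static void visit_runs(size_t field) {
  size_t bitidx = 0;
  while (bitidx < FIELD_BITS) {
    size_t len = 0;
    while (bitidx + len < FIELD_BITS && ((field >> (bitidx + len)) & 1) != 0) {
      len++;                              /* extend the current run of ones */
    }
    if (len > 0) {
      printf("run at bit %zu, length %zu\n", bitidx, len);
    }
    bitidx += len + 1;                    /* +1 skips the zero bit (or the end) */
  }
}

int main(void) {
  visit_runs(0xF0F1u);                    /* bits 0, 4..7 and 12..15 set */
  return 0;
}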
+ mi_track_mem_undefined(p,size); + // potentially decommit - if (arena->is_committed) { - mi_assert_internal(all_committed); + if (arena->memid.is_pinned || arena->blocks_committed == NULL) { + mi_assert_internal(all_committed); } else { mi_assert_internal(arena->blocks_committed != NULL); - _mi_os_decommit(p, blocks * MI_ARENA_BLOCK_SIZE, stats); // ok if this fails - _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx); + mi_assert_internal(arena->blocks_purge != NULL); + + if (!all_committed) { + // mark the entire range as no longer committed (so we recommit the full range when re-using) + _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx); + mi_track_mem_noaccess(p,size); + if (committed_size > 0) { + // if partially committed, adjust the committed stats (is it will be recommitted when re-using) + // in the delayed purge, we now need to not count a decommit if the range is not marked as committed. + _mi_stat_decrease(&_mi_stats_main.committed, committed_size); + } + // note: if not all committed, it may be that the purge will reset/decommit the entire range + // that contains already decommitted parts. Since purge consistently uses reset or decommit that + // works (as we should never reset decommitted parts). + } + // (delay) purge the entire range + mi_arena_schedule_purge(arena, bitmap_idx, blocks, stats); } - // and make it available to others again + + // and make it available to others again bool all_inuse = _mi_bitmap_unclaim_across(arena->blocks_inuse, arena->field_count, blocks, bitmap_idx); if (!all_inuse) { - _mi_error_message(EAGAIN, "trying to free an already freed block: %p, size %zu\n", p, size); + _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", p, size); return; }; } + else { + // arena was none, external, or static; nothing to do + mi_assert_internal(memid.memkind < MI_MEM_OS); + } + + // purge expired decommits + mi_arenas_try_purge(false, false, stats); +} + +// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` +// for dynamic libraries that are unloaded and need to release all their allocated memory. +static void mi_arenas_unsafe_destroy(void) { + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + size_t new_max_arena = 0; + for (size_t i = 0; i < max_arena; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); + if (arena != NULL) { + if (arena->start != NULL && mi_memkind_is_os(arena->memid.memkind)) { + mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL); + _mi_os_free(arena->start, mi_arena_size(arena), arena->memid, &_mi_stats_main); + } + else { + new_max_arena = i; + } + mi_arena_meta_free(arena, arena->meta_memid, arena->meta_size, &_mi_stats_main); + } + } + + // try to lower the max arena. + size_t expected = max_arena; + mi_atomic_cas_strong_acq_rel(&mi_arena_count, &expected, new_max_arena); +} + +// Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired +void _mi_arenas_collect(bool force_purge, mi_stats_t* stats) { + mi_arenas_try_purge(force_purge, force_purge /* visit all? */, stats); +} + +// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` +// for dynamic libraries that are unloaded and need to release all their allocated memory. 
+void _mi_arena_unsafe_destroy_all(mi_stats_t* stats) { + mi_arenas_unsafe_destroy(); + _mi_arenas_collect(true /* force purge */, stats); // purge non-owned arenas +} + +// Is a pointer inside any of our arenas? +bool _mi_arena_contains(const void* p) { + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + for (size_t i = 0; i < max_arena; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); + if (arena != NULL && arena->start <= (const uint8_t*)p && arena->start + mi_arena_block_size(arena->block_count) > (const uint8_t*)p) { + return true; + } + } + return false; +} + +/* ----------------------------------------------------------- + Abandoned blocks/segments. + This is used to atomically abandon/reclaim segments + (and crosses the arena API but it is convenient to have here). + Abandoned segments still have live blocks; they get reclaimed + when a thread frees a block in it, or when a thread needs a fresh + segment; these threads scan the abandoned segments through + the arena bitmaps. +----------------------------------------------------------- */ + +// Maintain a count of all abandoned segments +static mi_decl_cache_align _Atomic(size_t)abandoned_count; + +size_t _mi_arena_segment_abandoned_count(void) { + return mi_atomic_load_relaxed(&abandoned_count); +} + +// reclaim a specific abandoned segment; `true` on success. +// sets the thread_id. +bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment ) +{ + if (segment->memid.memkind != MI_MEM_ARENA) { + // not in an arena, consider it un-abandoned now. + // but we need to still claim it atomically -- we use the thread_id for that. + size_t expected = 0; + if (mi_atomic_cas_strong_acq_rel(&segment->thread_id, &expected, _mi_thread_id())) { + mi_atomic_decrement_relaxed(&abandoned_count); + return true; + } + else { + return false; + } + } + // arena segment: use the blocks_abandoned bitmap. + size_t arena_idx; + size_t bitmap_idx; + mi_arena_memid_indices(segment->memid, &arena_idx, &bitmap_idx); + mi_assert_internal(arena_idx < MI_MAX_ARENAS); + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]); + mi_assert_internal(arena != NULL); + bool was_marked = _mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx); + if (was_marked) { + mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0); + mi_atomic_decrement_relaxed(&abandoned_count); + mi_atomic_store_release(&segment->thread_id, _mi_thread_id()); + } + // mi_assert_internal(was_marked); + mi_assert_internal(!was_marked || _mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); + //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); + return was_marked; +} + +// mark a specific segment as abandoned +// clears the thread_id. 
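/* For a segment that is not backed by an arena, the reclaim path above takes
   ownership by CAS-ing `thread_id` from 0 (abandoned) to the caller's id, so
   exactly one thread wins; abandoning publishes the segment with a releasing
   store of 0. A standalone sketch of that handshake (names hypothetical): */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

typedef struct { atomic_uintptr_t thread_id; /* 0 means abandoned */ } seg_t;

static bool try_adopt(seg_t* seg, uintptr_t my_id) {
  uintptr_t expected = 0;
  /* succeeds for exactly one caller; the acquire pairs with abandon's release */
  return atomic_compare_exchange_strong_explicit(&seg->thread_id, &expected, my_id,
                                                 memory_order_acq_rel,
                                                 memory_order_acquire);
}

static void abandon(seg_t* seg) {
  /* releasing store: a later adopter sees all prior writes to the segment */
  atomic_store_explicit(&seg->thread_id, 0, memory_order_release);
}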
+void _mi_arena_segment_mark_abandoned(mi_segment_t* segment) +{ + mi_atomic_store_release(&segment->thread_id, 0); + mi_assert_internal(segment->used == segment->abandoned); + if (segment->memid.memkind != MI_MEM_ARENA) { + // not in an arena; count it as abandoned and return + mi_atomic_increment_relaxed(&abandoned_count); + return; + } + size_t arena_idx; + size_t bitmap_idx; + mi_arena_memid_indices(segment->memid, &arena_idx, &bitmap_idx); + mi_assert_internal(arena_idx < MI_MAX_ARENAS); + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]); + mi_assert_internal(arena != NULL); + const bool was_unmarked = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL); + if (was_unmarked) { mi_atomic_increment_relaxed(&abandoned_count); } + mi_assert_internal(was_unmarked); + mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); +} + +// start a cursor at a randomized arena +void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_arena_field_cursor_t* current) { + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + current->start = (max_arena == 0 ? 0 : (mi_arena_id_t)( _mi_heap_random_next(heap) % max_arena)); + current->count = 0; + current->bitmap_idx = 0; +} + +// reclaim abandoned segments +// this does not set the thread id (so it appears as still abandoned) +mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous ) +{ + const int max_arena = (int)mi_atomic_load_relaxed(&mi_arena_count); + if (max_arena <= 0 || mi_atomic_load_relaxed(&abandoned_count) == 0) return NULL; + + int count = previous->count; + size_t field_idx = mi_bitmap_index_field(previous->bitmap_idx); + size_t bit_idx = mi_bitmap_index_bit_in_field(previous->bitmap_idx) + 1; + // visit arena's (from previous) + for (; count < max_arena; count++, field_idx = 0, bit_idx = 0) { + mi_arena_id_t arena_idx = previous->start + count; + if (arena_idx >= max_arena) { arena_idx = arena_idx % max_arena; } // wrap around + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]); + if (arena != NULL) { + // visit the abandoned fields (starting at previous_idx) + for ( ; field_idx < arena->field_count; field_idx++, bit_idx = 0) { + size_t field = mi_atomic_load_relaxed(&arena->blocks_abandoned[field_idx]); + if mi_unlikely(field != 0) { // skip zero fields quickly + // visit each set bit in the field (todo: maybe use `ctz` here?) 
+ for ( ; bit_idx < MI_BITMAP_FIELD_BITS; bit_idx++) { + // pre-check if the bit is set + size_t mask = ((size_t)1 << bit_idx); + if mi_unlikely((field & mask) == mask) { + mi_bitmap_index_t bitmap_idx = mi_bitmap_index_create(field_idx, bit_idx); + // try to reclaim it atomically + if (_mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx)) { + mi_atomic_decrement_relaxed(&abandoned_count); + previous->bitmap_idx = bitmap_idx; + previous->count = count; + mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); + mi_segment_t* segment = (mi_segment_t*)mi_arena_block_start(arena, bitmap_idx); + mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0); + //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); + return segment; + } + } + } + } + } + } + } + // no more found + previous->bitmap_idx = 0; + previous->count = 0; + return NULL; } + /* ----------------------------------------------------------- Add an arena. ----------------------------------------------------------- */ -static bool mi_arena_add(mi_arena_t* arena) { +static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* stats) { mi_assert_internal(arena != NULL); mi_assert_internal((uintptr_t)mi_atomic_load_ptr_relaxed(uint8_t,&arena->start) % MI_SEGMENT_ALIGN == 0); mi_assert_internal(arena->block_count > 0); + if (arena_id != NULL) { *arena_id = -1; } - uintptr_t i = mi_atomic_increment_acq_rel(&mi_arena_count); + size_t i = mi_atomic_increment_acq_rel(&mi_arena_count); if (i >= MI_MAX_ARENAS) { mi_atomic_decrement_acq_rel(&mi_arena_count); return false; } + _mi_stat_counter_increase(&stats->arena_count,1); + arena->id = mi_arena_id_create(i); mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena); + if (arena_id != NULL) { *arena_id = arena->id; } return true; } -bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept +static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept { + if (arena_id != NULL) *arena_id = _mi_arena_id_none(); + if (size < MI_ARENA_BLOCK_SIZE) return false; + if (is_large) { - mi_assert_internal(is_committed); - is_committed = true; + mi_assert_internal(memid.initially_committed && memid.is_pinned); } - - const size_t bcount = mi_block_count_of_size(size); + + const size_t bcount = size / MI_ARENA_BLOCK_SIZE; const size_t fields = _mi_divide_up(bcount, MI_BITMAP_FIELD_BITS); - const size_t bitmaps = (is_committed ? 2 : 3); + const size_t bitmaps = (memid.is_pinned ? 3 : 5); const size_t asize = sizeof(mi_arena_t) + (bitmaps*fields*sizeof(mi_bitmap_field_t)); - mi_arena_t* arena = (mi_arena_t*)_mi_os_alloc(asize, &_mi_stats_main); // TODO: can we avoid allocating from the OS? + mi_memid_t meta_memid; + mi_arena_t* arena = (mi_arena_t*)mi_arena_meta_zalloc(asize, &meta_memid, &_mi_stats_main); // TODO: can we avoid allocating from the OS? 
if (arena == NULL) return false; + // already zero'd due to zalloc + // _mi_memzero(arena, asize); + arena->id = _mi_arena_id_none(); + arena->memid = memid; + arena->exclusive = exclusive; + arena->meta_size = asize; + arena->meta_memid = meta_memid; arena->block_count = bcount; arena->field_count = fields; arena->start = (uint8_t*)start; arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) arena->is_large = is_large; - arena->is_zero_init = is_zero; - arena->is_committed = is_committed; + arena->purge_expire = 0; arena->search_idx = 0; - arena->blocks_dirty = &arena->blocks_inuse[fields]; // just after inuse bitmap - arena->blocks_committed = (is_committed ? NULL : &arena->blocks_inuse[2*fields]); // just after dirty bitmap - // the bitmaps are already zero initialized due to os_alloc - // just claim leftover blocks if needed + // consequetive bitmaps + arena->blocks_dirty = &arena->blocks_inuse[fields]; // just after inuse bitmap + arena->blocks_abandoned = &arena->blocks_inuse[2 * fields]; // just after dirty bitmap + arena->blocks_committed = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[3*fields]); // just after abandoned bitmap + arena->blocks_purge = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[4*fields]); // just after committed bitmap + // initialize committed bitmap? + if (arena->blocks_committed != NULL && arena->memid.initially_committed) { + memset((void*)arena->blocks_committed, 0xFF, fields*sizeof(mi_bitmap_field_t)); // cast to void* to avoid atomic warning + } + + // and claim leftover blocks if needed (so we never allocate there) ptrdiff_t post = (fields * MI_BITMAP_FIELD_BITS) - bcount; mi_assert_internal(post >= 0); if (post > 0) { @@ -313,52 +939,132 @@ bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_la mi_bitmap_index_t postidx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post); _mi_bitmap_claim(arena->blocks_inuse, fields, post, postidx, NULL); } + return mi_arena_add(arena, arena_id, &_mi_stats_main); - mi_arena_add(arena); - return true; +} + +bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { + mi_memid_t memid = _mi_memid_create(MI_MEM_EXTERNAL); + memid.initially_committed = is_committed; + memid.initially_zero = is_zero; + memid.is_pinned = is_large; + return mi_manage_os_memory_ex2(start,size,is_large,numa_node,exclusive,memid, arena_id); } // Reserve a range of regular OS memory -int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept -{ - size = _mi_os_good_alloc_size(size); - bool large = allow_large; - void* start = _mi_os_alloc_aligned(size, MI_SEGMENT_ALIGN, commit, &large, &_mi_stats_main); - if (start==NULL) return ENOMEM; - if (!mi_manage_os_memory(start, size, (large || commit), large, true, -1)) { - _mi_os_free_ex(start, size, commit, &_mi_stats_main); - _mi_verbose_message("failed to reserve %zu k memory\n", _mi_divide_up(size,1024)); +int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { + if (arena_id != NULL) *arena_id = _mi_arena_id_none(); + size = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); // at least one block + mi_memid_t memid; + void* start = _mi_os_alloc_aligned(size, MI_SEGMENT_ALIGN, commit, allow_large, &memid, &_mi_stats_main); + if (start == NULL) return ENOMEM; + const bool 
is_large = memid.is_pinned; // todo: use separate is_large field? + if (!mi_manage_os_memory_ex2(start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { + _mi_os_free_ex(start, size, commit, memid, &_mi_stats_main); + _mi_verbose_message("failed to reserve %zu KiB memory\n", _mi_divide_up(size, 1024)); return ENOMEM; } - _mi_verbose_message("reserved %zu kb memory%s\n", _mi_divide_up(size,1024), large ? " (in large os pages)" : ""); + _mi_verbose_message("reserved %zu KiB memory%s\n", _mi_divide_up(size, 1024), is_large ? " (in large os pages)" : ""); return 0; } +// Manage a range of regular OS memory +bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept { + return mi_manage_os_memory_ex(start, size, is_committed, is_large, is_zero, numa_node, false /* exclusive? */, NULL); +} + +// Reserve a range of regular OS memory +int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept { + return mi_reserve_os_memory_ex(size, commit, allow_large, false, NULL); +} + + +/* ----------------------------------------------------------- + Debugging +----------------------------------------------------------- */ + +static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_t block_count, mi_bitmap_field_t* fields, size_t field_count ) { + _mi_verbose_message("%s%s:\n", prefix, header); + size_t bcount = 0; + size_t inuse_count = 0; + for (size_t i = 0; i < field_count; i++) { + char buf[MI_BITMAP_FIELD_BITS + 1]; + uintptr_t field = mi_atomic_load_relaxed(&fields[i]); + for (size_t bit = 0; bit < MI_BITMAP_FIELD_BITS; bit++, bcount++) { + if (bcount < block_count) { + bool inuse = ((((uintptr_t)1 << bit) & field) != 0); + if (inuse) inuse_count++; + buf[bit] = (inuse ? 'x' : '.'); + } + else { + buf[bit] = ' '; + } + } + buf[MI_BITMAP_FIELD_BITS] = 0; + _mi_verbose_message("%s %s\n", prefix, buf); + } + _mi_verbose_message("%s total ('x'): %zu\n", prefix, inuse_count); + return inuse_count; +} + +void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept { + size_t max_arenas = mi_atomic_load_relaxed(&mi_arena_count); + size_t inuse_total = 0; + size_t abandoned_total = 0; + size_t purge_total = 0; + for (size_t i = 0; i < max_arenas; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); + if (arena == NULL) break; + _mi_verbose_message("arena %zu: %zu blocks of size %zuMiB (in %zu fields) %s\n", i, arena->block_count, MI_ARENA_BLOCK_SIZE / MI_MiB, arena->field_count, (arena->memid.is_pinned ? 
", pinned" : "")); + if (show_inuse) { + inuse_total += mi_debug_show_bitmap(" ", "inuse blocks", arena->block_count, arena->blocks_inuse, arena->field_count); + } + if (arena->blocks_committed != NULL) { + mi_debug_show_bitmap(" ", "committed blocks", arena->block_count, arena->blocks_committed, arena->field_count); + } + if (show_abandoned) { + abandoned_total += mi_debug_show_bitmap(" ", "abandoned blocks", arena->block_count, arena->blocks_abandoned, arena->field_count); + } + if (show_purge && arena->blocks_purge != NULL) { + purge_total += mi_debug_show_bitmap(" ", "purgeable blocks", arena->block_count, arena->blocks_purge, arena->field_count); + } + } + if (show_inuse) _mi_verbose_message("total inuse blocks : %zu\n", inuse_total); + if (show_abandoned) _mi_verbose_message("total abandoned blocks: %zu\n", abandoned_total); + if (show_purge) _mi_verbose_message("total purgeable blocks: %zu\n", purge_total); +} + + /* ----------------------------------------------------------- Reserve a huge page arena. ----------------------------------------------------------- */ // reserve at a specific numa node -int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept { +int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { + if (arena_id != NULL) *arena_id = -1; if (pages==0) return 0; if (numa_node < -1) numa_node = -1; if (numa_node >= 0) numa_node = numa_node % _mi_os_numa_node_count(); size_t hsize = 0; size_t pages_reserved = 0; - void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, timeout_msecs, &pages_reserved, &hsize); + mi_memid_t memid; + void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, timeout_msecs, &pages_reserved, &hsize, &memid); if (p==NULL || pages_reserved==0) { - _mi_warning_message("failed to reserve %zu gb huge pages\n", pages); + _mi_warning_message("failed to reserve %zu GiB huge pages\n", pages); return ENOMEM; } - _mi_verbose_message("numa node %i: reserved %zu gb huge pages (of the %zu gb requested)\n", numa_node, pages_reserved, pages); + _mi_verbose_message("numa node %i: reserved %zu GiB huge pages (of the %zu GiB requested)\n", numa_node, pages_reserved, pages); - if (!mi_manage_os_memory(p, hsize, true, true, true, numa_node)) { - _mi_os_free_huge_pages(p, hsize, &_mi_stats_main); + if (!mi_manage_os_memory_ex2(p, hsize, true, numa_node, exclusive, memid, arena_id)) { + _mi_os_free(p, hsize, memid, &_mi_stats_main); return ENOMEM; } return 0; } +int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept { + return mi_reserve_huge_os_pages_at_ex(pages, numa_node, timeout_msecs, false, NULL); +} // reserve huge pages evenly among the given number of numa nodes (or use the available ones as detected) int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept { @@ -389,10 +1095,11 @@ int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t } int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept { - UNUSED(max_secs); + MI_UNUSED(max_secs); _mi_warning_message("mi_reserve_huge_os_pages is deprecated: use mi_reserve_huge_os_pages_interleave/at instead\n"); if (pages_reserved != NULL) *pages_reserved = 0; int err = mi_reserve_huge_os_pages_interleave(pages, 0, (size_t)(max_secs * 1000.0)); if (err==0 && pages_reserved!=NULL) *pages_reserved = pages; return err; } + diff 
--git a/contrib/libs/mimalloc/src/bitmap.c b/contrib/libs/mimalloc/src/bitmap.c index 3b5c8199ca37..976ba72c634c 100644 --- a/contrib/libs/mimalloc/src/bitmap.c +++ b/contrib/libs/mimalloc/src/bitmap.c @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2019-2021 Microsoft Research, Daan Leijen +Copyright (c) 2019-2023 Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -7,18 +7,17 @@ terms of the MIT license. A copy of the license can be found in the file /* ---------------------------------------------------------------------------- Concurrent bitmap that can set/reset sequences of bits atomically, -represeted as an array of fields where each field is a machine word (`uintptr_t`) +represented as an array of fields where each field is a machine word (`size_t`) There are two api's; the standard one cannot have sequences that cross between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS). -(this is used in region allocation) The `_across` postfixed functions do allow sequences that can cross over between the fields. (This is used in arena allocation) ---------------------------------------------------------------------------- */ #include "mimalloc.h" -#include "mimalloc-internal.h" +#include "mimalloc/internal.h" #include "bitmap.h" /* ----------------------------------------------------------- @@ -26,12 +25,12 @@ between the fields. (This is used in arena allocation) ----------------------------------------------------------- */ // The bit mask for a given number of blocks at a specified bit index. -static inline uintptr_t mi_bitmap_mask_(size_t count, size_t bitidx) { +static inline size_t mi_bitmap_mask_(size_t count, size_t bitidx) { mi_assert_internal(count + bitidx <= MI_BITMAP_FIELD_BITS); mi_assert_internal(count > 0); if (count >= MI_BITMAP_FIELD_BITS) return MI_BITMAP_FIELD_FULL; if (count == 0) return 0; - return ((((uintptr_t)1 << count) - 1) << bitidx); + return ((((size_t)1 << count) - 1) << bitidx); } @@ -46,29 +45,29 @@ bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_ { mi_assert_internal(bitmap_idx != NULL); mi_assert_internal(count <= MI_BITMAP_FIELD_BITS); - _Atomic(uintptr_t)* field = &bitmap[idx]; - uintptr_t map = mi_atomic_load_relaxed(field); + mi_bitmap_field_t* field = &bitmap[idx]; + size_t map = mi_atomic_load_relaxed(field); if (map==MI_BITMAP_FIELD_FULL) return false; // short cut // search for 0-bit sequence of length count - const uintptr_t mask = mi_bitmap_mask_(count, 0); - const size_t bitidx_max = MI_BITMAP_FIELD_BITS - count; + const size_t mask = mi_bitmap_mask_(count, 0); + const size_t bitidx_max = MI_BITMAP_FIELD_BITS - count; #ifdef MI_HAVE_FAST_BITSCAN size_t bitidx = mi_ctz(~map); // quickly find the first zero bit if possible #else size_t bitidx = 0; // otherwise start at 0 #endif - uintptr_t m = (mask << bitidx); // invariant: m == mask shifted by bitidx + size_t m = (mask << bitidx); // invariant: m == mask shifted by bitidx // scan linearly for a free range of zero bits while (bitidx <= bitidx_max) { - const uintptr_t mapm = map & m; + const size_t mapm = (map & m); if (mapm == 0) { // are the mask bits free at bitidx? mi_assert_internal((m >> bitidx) == mask); // no overflow? 
- const uintptr_t newmap = map | m; + const size_t newmap = (map | m); mi_assert_internal((newmap^map) >> bitidx == mask); - if (!mi_atomic_cas_weak_acq_rel(field, &map, newmap)) { // TODO: use strong cas here? + if (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)) { // TODO: use weak cas here? // no success, another thread claimed concurrently.. keep going (with updated `map`) continue; } @@ -81,7 +80,8 @@ bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_ else { // on to the next bit range #ifdef MI_HAVE_FAST_BITSCAN - const size_t shift = (count == 1 ? 1 : mi_bsr(mapm) - bitidx + 1); + mi_assert_internal(mapm != 0); + const size_t shift = (count == 1 ? 1 : (MI_INTPTR_BITS - mi_clz(mapm) - bitidx)); mi_assert_internal(shift > 0 && shift <= count); #else const size_t shift = 1; @@ -100,7 +100,7 @@ bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_ bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx) { size_t idx = start_field_idx; for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) { - if (idx >= bitmap_fields) idx = 0; // wrap + if (idx >= bitmap_fields) { idx = 0; } // wrap if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) { return true; } @@ -108,23 +108,16 @@ bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fiel return false; } -/* -// Find `count` bits of 0 and set them to 1 atomically; returns `true` on success. -// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never span fields. -bool _mi_bitmap_try_find_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t count, mi_bitmap_index_t* bitmap_idx) { - return _mi_bitmap_try_find_from_claim(bitmap, bitmap_fields, 0, count, bitmap_idx); -} -*/ // Set `count` bits at `bitmap_idx` to 0 atomically // Returns `true` if all `count` bits were 1 previously. 
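/* A standalone sketch of the mask arithmetic behind the claim/unclaim
   operations: `count` one-bits starting at `bitidx` (with bitidx + count not
   exceeding the field width), cleared by a single atomic AND whose previous
   value tells us whether every bit was set before. Mirrors mi_bitmap_mask_
   and _mi_bitmap_unclaim above; illustrative only: */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

#define FIELD_BITS (8 * sizeof(size_t))

static size_t bit_mask(size_t count, size_t bitidx) {
  if (count >= FIELD_BITS) return ~(size_t)0;              /* full field */
  return (((size_t)1 << count) - 1) << bitidx;
}

/* clear `count` bits at `bitidx`; true if they were all 1 beforehand */
static bool unclaim(atomic_size_t* field, size_t count, size_t bitidx) {
  const size_t mask = bit_mask(count, bitidx);
  const size_t prev = atomic_fetch_and_explicit(field, ~mask, memory_order_acq_rel);
  return ((prev & mask) == mask);
}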
-bool mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { +bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { const size_t idx = mi_bitmap_index_field(bitmap_idx); const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); - const uintptr_t mask = mi_bitmap_mask_(count, bitidx); - mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields); + const size_t mask = mi_bitmap_mask_(count, bitidx); + mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields); // mi_assert_internal((bitmap[idx] & mask) == mask); - uintptr_t prev = mi_atomic_and_acq_rel(&bitmap[idx], ~mask); + const size_t prev = mi_atomic_and_acq_rel(&bitmap[idx], ~mask); return ((prev & mask) == mask); } @@ -134,11 +127,11 @@ bool mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, m bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero) { const size_t idx = mi_bitmap_index_field(bitmap_idx); const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); - const uintptr_t mask = mi_bitmap_mask_(count, bitidx); - mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields); + const size_t mask = mi_bitmap_mask_(count, bitidx); + mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields); //mi_assert_internal(any_zero != NULL || (bitmap[idx] & mask) == 0); - uintptr_t prev = mi_atomic_or_acq_rel(&bitmap[idx], mask); - if (any_zero != NULL) *any_zero = ((prev & mask) != mask); + size_t prev = mi_atomic_or_acq_rel(&bitmap[idx], mask); + if (any_zero != NULL) { *any_zero = ((prev & mask) != mask); } return ((prev & mask) == 0); } @@ -146,13 +139,30 @@ bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi static bool mi_bitmap_is_claimedx(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_ones) { const size_t idx = mi_bitmap_index_field(bitmap_idx); const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); - const uintptr_t mask = mi_bitmap_mask_(count, bitidx); - mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields); - uintptr_t field = mi_atomic_load_relaxed(&bitmap[idx]); - if (any_ones != NULL) *any_ones = ((field & mask) != 0); + const size_t mask = mi_bitmap_mask_(count, bitidx); + mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields); + const size_t field = mi_atomic_load_relaxed(&bitmap[idx]); + if (any_ones != NULL) { *any_ones = ((field & mask) != 0); } return ((field & mask) == mask); } +// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically. +// Returns `true` if successful when all previous `count` bits were 0. 
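/* For intuition, a standalone sketch of the try-claim loop documented above:
   keep retrying a strong CAS while the requested bits stay free, and give up
   as soon as any of them is observed set. Mirrors _mi_bitmap_try_claim;
   illustrative only: */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

static bool try_claim_bits(atomic_size_t* field, size_t mask) {
  size_t expected = atomic_load_explicit(field, memory_order_relaxed);
  do {
    if ((expected & mask) != 0) return false;   /* someone else holds a bit */
  } while (!atomic_compare_exchange_strong_explicit(field, &expected, expected | mask,
                                                    memory_order_acq_rel,
                                                    memory_order_acquire));
  return true;                                  /* all `mask` bits went 0 -> 1 */
}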
+bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { + const size_t idx = mi_bitmap_index_field(bitmap_idx); + const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); + const size_t mask = mi_bitmap_mask_(count, bitidx); + mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields); + size_t expected = mi_atomic_load_relaxed(&bitmap[idx]); + do { + if ((expected & mask) != 0) return false; + } + while (!mi_atomic_cas_strong_acq_rel(&bitmap[idx], &expected, expected | mask)); + mi_assert_internal((expected & mask) == 0); + return true; +} + + bool _mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { return mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, NULL); } @@ -169,87 +179,93 @@ bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t // between the fields. This is used in arena allocation //-------------------------------------------------------------------------- -// Try to atomically claim a sequence of `count` bits starting from the field +// Try to atomically claim a sequence of `count` bits starting from the field // at `idx` in `bitmap` and crossing into subsequent fields. Returns `true` on success. -static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t idx, const size_t count, const size_t retries, mi_bitmap_index_t* bitmap_idx) +// Only needs to consider crossing into the next fields (see `mi_bitmap_try_find_from_claim_across`) +static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t idx, const size_t count, const size_t retries, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats) { mi_assert_internal(bitmap_idx != NULL); - + // check initial trailing zeros - _Atomic(uintptr_t)* field = &bitmap[idx]; - uintptr_t map = mi_atomic_load_relaxed(field); + mi_bitmap_field_t* field = &bitmap[idx]; + size_t map = mi_atomic_load_relaxed(field); const size_t initial = mi_clz(map); // count of initial zeros starting at idx mi_assert_internal(initial <= MI_BITMAP_FIELD_BITS); if (initial == 0) return false; - if (initial >= count) return _mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx); // no need to cross fields + if (initial >= count) return _mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx); // no need to cross fields (this case won't happen for us) if (_mi_divide_up(count - initial, MI_BITMAP_FIELD_BITS) >= (bitmap_fields - idx)) return false; // not enough entries - + // scan ahead size_t found = initial; - uintptr_t mask = 0; // mask bits for the final field + size_t mask = 0; // mask bits for the final field while(found < count) { field++; map = mi_atomic_load_relaxed(field); - const uintptr_t mask_bits = (found + MI_BITMAP_FIELD_BITS <= count ? MI_BITMAP_FIELD_BITS : (count - found)); + const size_t mask_bits = (found + MI_BITMAP_FIELD_BITS <= count ? 
MI_BITMAP_FIELD_BITS : (count - found)); + mi_assert_internal(mask_bits > 0 && mask_bits <= MI_BITMAP_FIELD_BITS); mask = mi_bitmap_mask_(mask_bits, 0); - if ((map & mask) != 0) return false; + if ((map & mask) != 0) return false; // some part is already claimed found += mask_bits; } mi_assert_internal(field < &bitmap[bitmap_fields]); - // found range of zeros up to the final field; mask contains mask in the final field - // now claim it atomically - _Atomic(uintptr_t)* const final_field = field; - const uintptr_t final_mask = mask; - _Atomic(uintptr_t)* const initial_field = &bitmap[idx]; - const uintptr_t initial_mask = mi_bitmap_mask_(initial, MI_BITMAP_FIELD_BITS - initial); + // we found a range of contiguous zeros up to the final field; mask contains mask in the final field + // now try to claim the range atomically + mi_bitmap_field_t* const final_field = field; + const size_t final_mask = mask; + mi_bitmap_field_t* const initial_field = &bitmap[idx]; + const size_t initial_idx = MI_BITMAP_FIELD_BITS - initial; + const size_t initial_mask = mi_bitmap_mask_(initial, initial_idx); // initial field - uintptr_t newmap; + size_t newmap; field = initial_field; map = mi_atomic_load_relaxed(field); do { - newmap = map | initial_mask; + newmap = (map | initial_mask); if ((map & initial_mask) != 0) { goto rollback; }; } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)); - + // intermediate fields while (++field < final_field) { newmap = mi_bitmap_mask_(MI_BITMAP_FIELD_BITS, 0); map = 0; if (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)) { goto rollback; } } - + // final field mi_assert_internal(field == final_field); map = mi_atomic_load_relaxed(field); do { - newmap = map | final_mask; + newmap = (map | final_mask); if ((map & final_mask) != 0) { goto rollback; } } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)); // claimed! - *bitmap_idx = mi_bitmap_index_create(idx, MI_BITMAP_FIELD_BITS - initial); + mi_stat_counter_increase(stats->arena_crossover_count,1); + *bitmap_idx = mi_bitmap_index_create(idx, initial_idx); return true; -rollback: +rollback: // roll back intermediate fields + // (we just failed to claim `field` so decrement first) while (--field > initial_field) { newmap = 0; map = mi_bitmap_mask_(MI_BITMAP_FIELD_BITS, 0); mi_assert_internal(mi_atomic_load_relaxed(field) == map); mi_atomic_store_release(field, newmap); } - if (field == initial_field) { + if (field == initial_field) { // (if we failed on the initial field, `field + 1 == initial_field`) map = mi_atomic_load_relaxed(field); do { mi_assert_internal((map & initial_mask) == initial_mask); - newmap = map & ~initial_mask; + newmap = (map & ~initial_mask); } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)); - } + } + mi_stat_counter_increase(stats->arena_rollback_count,1); // retry? (we make a recursive call instead of goto to be able to use const declarations) - if (retries < 4) { - return mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, retries+1, bitmap_idx); + if (retries <= 2) { + return mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, retries+1, bitmap_idx, stats); } else { return false; @@ -259,20 +275,27 @@ static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bit // Find `count` bits of zeros and set them to 1 atomically; returns `true` on success. // Starts at idx, and wraps around to search in all `bitmap_fields` fields. 
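/* The cross-field claim above first claims the tail of one field and then the
   following fields; if a later step finds a bit already taken, the earlier
   fields are rolled back so other threads never observe a half-claimed range.
   A two-field sketch of that claim-or-rollback idea (illustrative only): */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

static bool claim_mask(atomic_size_t* f, size_t mask) {
  size_t map = atomic_load_explicit(f, memory_order_relaxed);
  do {
    if ((map & mask) != 0) return false;        /* part of the range is taken */
  } while (!atomic_compare_exchange_strong_explicit(f, &map, map | mask,
                                                    memory_order_acq_rel,
                                                    memory_order_acquire));
  return true;
}

static bool claim_across(atomic_size_t* first, size_t first_mask,
                         atomic_size_t* second, size_t second_mask) {
  if (!claim_mask(first, first_mask)) return false;
  if (!claim_mask(second, second_mask)) {
    /* roll back the first field so the range is fully claimed or not at all */
    atomic_fetch_and_explicit(first, ~first_mask, memory_order_acq_rel);
    return false;
  }
  return true;
}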
-bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx) { +bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats) { mi_assert_internal(count > 0); - if (count==1) return _mi_bitmap_try_find_from_claim(bitmap, bitmap_fields, start_field_idx, count, bitmap_idx); + if (count <= 2) { + // we don't bother with crossover fields for small counts + return _mi_bitmap_try_find_from_claim(bitmap, bitmap_fields, start_field_idx, count, bitmap_idx); + } + + // visit the fields size_t idx = start_field_idx; for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) { - if (idx >= bitmap_fields) idx = 0; // wrap - // try to claim inside the field + if (idx >= bitmap_fields) { idx = 0; } // wrap + // first try to claim inside a field + /* if (count <= MI_BITMAP_FIELD_BITS) { if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) { return true; } } - // try to claim across fields - if (mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, 0, bitmap_idx)) { + */ + // if that fails, then try to claim across fields + if (mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, 0, bitmap_idx, stats)) { return true; } } @@ -280,10 +303,10 @@ bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitm } // Helper for masks across fields; returns the mid count, post_mask may be 0 -static size_t mi_bitmap_mask_across(mi_bitmap_index_t bitmap_idx, size_t bitmap_fields, size_t count, uintptr_t* pre_mask, uintptr_t* mid_mask, uintptr_t* post_mask) { - UNUSED_RELEASE(bitmap_fields); +static size_t mi_bitmap_mask_across(mi_bitmap_index_t bitmap_idx, size_t bitmap_fields, size_t count, size_t* pre_mask, size_t* mid_mask, size_t* post_mask) { + MI_UNUSED(bitmap_fields); const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); - if (mi_likely(bitidx + count <= MI_BITMAP_FIELD_BITS)) { + if mi_likely(bitidx + count <= MI_BITMAP_FIELD_BITS) { *pre_mask = mi_bitmap_mask_(count, bitidx); *mid_mask = 0; *post_mask = 0; @@ -308,37 +331,37 @@ static size_t mi_bitmap_mask_across(mi_bitmap_index_t bitmap_idx, size_t bitmap_ // Returns `true` if all `count` bits were 1 previously. 
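/* mi_bitmap_mask_across above splits a range that crosses field boundaries
   into a head mask in the first field, a number of full middle fields, and a
   tail mask in the last field. A worked example of that decomposition,
   assuming 64-bit fields and a range that really does cross a boundary
   (illustrative only; not the mimalloc API): */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define FIELD_BITS 64u

static uint64_t head_mask(size_t bitidx) {
  /* bits bitidx..63 of the first field */
  return (bitidx == 0 ? ~UINT64_C(0) : (~UINT64_C(0)) << bitidx);
}

static void split_across(size_t bitidx, size_t count,
                         uint64_t* pre, size_t* mid_fields, uint64_t* post) {
  const size_t pre_bits = FIELD_BITS - bitidx;   /* claimed in the first field */
  *pre = head_mask(bitidx);
  const size_t rest = count - pre_bits;          /* assumes count > pre_bits */
  *mid_fields = rest / FIELD_BITS;               /* fully covered middle fields */
  const size_t post_bits = rest % FIELD_BITS;    /* claimed in the last field */
  *post = (post_bits == 0 ? 0 : (UINT64_C(1) << post_bits) - 1);
}

int main(void) {
  uint64_t pre, post; size_t mid;
  split_across(40, 100, &pre, &mid, &post);      /* 100 bits starting at bit 40 */
  assert(pre == (~UINT64_C(0)) << 40);           /* 24 head bits: 40..63 */
  assert(mid == 1);                              /* one full 64-bit middle field */
  assert(post == (UINT64_C(1) << 12) - 1);       /* 12 tail bits: 24+64+12 == 100 */
  return 0;
}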
bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { size_t idx = mi_bitmap_index_field(bitmap_idx); - uintptr_t pre_mask; - uintptr_t mid_mask; - uintptr_t post_mask; - size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask); + size_t pre_mask; + size_t mid_mask; + size_t post_mask; + size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask); bool all_one = true; - _Atomic(uintptr_t)*field = &bitmap[idx]; - uintptr_t prev = mi_atomic_and_acq_rel(field++, ~pre_mask); + mi_bitmap_field_t* field = &bitmap[idx]; + size_t prev = mi_atomic_and_acq_rel(field++, ~pre_mask); // clear first part if ((prev & pre_mask) != pre_mask) all_one = false; while(mid_count-- > 0) { - prev = mi_atomic_and_acq_rel(field++, ~mid_mask); + prev = mi_atomic_and_acq_rel(field++, ~mid_mask); // clear mid part if ((prev & mid_mask) != mid_mask) all_one = false; } if (post_mask!=0) { - prev = mi_atomic_and_acq_rel(field, ~post_mask); + prev = mi_atomic_and_acq_rel(field, ~post_mask); // clear end part if ((prev & post_mask) != post_mask) all_one = false; } - return all_one; + return all_one; } // Set `count` bits at `bitmap_idx` to 1 atomically // Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit. bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero) { size_t idx = mi_bitmap_index_field(bitmap_idx); - uintptr_t pre_mask; - uintptr_t mid_mask; - uintptr_t post_mask; + size_t pre_mask; + size_t mid_mask; + size_t post_mask; size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask); bool all_zero = true; bool any_zero = false; - _Atomic(uintptr_t)*field = &bitmap[idx]; - uintptr_t prev = mi_atomic_or_acq_rel(field++, pre_mask); + _Atomic(size_t)*field = &bitmap[idx]; + size_t prev = mi_atomic_or_acq_rel(field++, pre_mask); if ((prev & pre_mask) != 0) all_zero = false; if ((prev & pre_mask) != pre_mask) any_zero = true; while (mid_count-- > 0) { @@ -351,23 +374,23 @@ bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t co if ((prev & post_mask) != 0) all_zero = false; if ((prev & post_mask) != post_mask) any_zero = true; } - if (pany_zero != NULL) *pany_zero = any_zero; + if (pany_zero != NULL) { *pany_zero = any_zero; } return all_zero; } -// Returns `true` if all `count` bits were 1. +// Returns `true` if all `count` bits were 1. // `any_ones` is `true` if there was at least one bit set to one. 
static bool mi_bitmap_is_claimedx_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_ones) { size_t idx = mi_bitmap_index_field(bitmap_idx); - uintptr_t pre_mask; - uintptr_t mid_mask; - uintptr_t post_mask; + size_t pre_mask; + size_t mid_mask; + size_t post_mask; size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask); bool all_ones = true; bool any_ones = false; - _Atomic(uintptr_t)* field = &bitmap[idx]; - uintptr_t prev = mi_atomic_load_relaxed(field++); + mi_bitmap_field_t* field = &bitmap[idx]; + size_t prev = mi_atomic_load_relaxed(field++); if ((prev & pre_mask) != pre_mask) all_ones = false; if ((prev & pre_mask) != 0) any_ones = true; while (mid_count-- > 0) { @@ -379,8 +402,8 @@ static bool mi_bitmap_is_claimedx_across(mi_bitmap_t bitmap, size_t bitmap_field prev = mi_atomic_load_relaxed(field); if ((prev & post_mask) != post_mask) all_ones = false; if ((prev & post_mask) != 0) any_ones = true; - } - if (pany_ones != NULL) *pany_ones = any_ones; + } + if (pany_ones != NULL) { *pany_ones = any_ones; } return all_ones; } diff --git a/contrib/libs/mimalloc/src/bitmap.h b/contrib/libs/mimalloc/src/bitmap.h index 21fd4e13d07b..a1e7686abc61 100644 --- a/contrib/libs/mimalloc/src/bitmap.h +++ b/contrib/libs/mimalloc/src/bitmap.h @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2019-2020 Microsoft Research, Daan Leijen +Copyright (c) 2019-2023 Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -7,7 +7,7 @@ terms of the MIT license. A copy of the license can be found in the file /* ---------------------------------------------------------------------------- Concurrent bitmap that can set/reset sequences of bits atomically, -represeted as an array of fields where each field is a machine word (`uintptr_t`) +represented as an array of fields where each field is a machine word (`size_t`) There are two api's; the standard one cannot have sequences that cross between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS). @@ -24,11 +24,11 @@ between the fields. (This is used in arena allocation) Bitmap definition ----------------------------------------------------------- */ -#define MI_BITMAP_FIELD_BITS (8*MI_INTPTR_SIZE) -#define MI_BITMAP_FIELD_FULL (~((uintptr_t)0)) // all bits set +#define MI_BITMAP_FIELD_BITS (8*MI_SIZE_SIZE) +#define MI_BITMAP_FIELD_FULL (~((size_t)0)) // all bits set -// An atomic bitmap of `uintptr_t` fields -typedef _Atomic(uintptr_t) mi_bitmap_field_t; +// An atomic bitmap of `size_t` fields +typedef _Atomic(size_t) mi_bitmap_field_t; typedef mi_bitmap_field_t* mi_bitmap_t; // A bitmap index is the index of the bit in a bitmap. @@ -69,7 +69,11 @@ bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fiel // Set `count` bits at `bitmap_idx` to 0 atomically // Returns `true` if all `count` bits were 1 previously. -bool mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); +bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); + +// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically. +// Returns `true` if successful when all previous `count` bits were 0. 
+bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); // Set `count` bits at `bitmap_idx` to 1 atomically // Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit. @@ -86,7 +90,7 @@ bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t // Find `count` bits of zeros and set them to 1 atomically; returns `true` on success. // Starts at idx, and wraps around to search in all `bitmap_fields` fields. -bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx); +bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats); // Set `count` bits at `bitmap_idx` to 0 atomically // Returns `true` if all `count` bits were 1 previously. diff --git a/contrib/libs/mimalloc/src/free.c b/contrib/libs/mimalloc/src/free.c new file mode 100644 index 000000000000..c065d2f3f666 --- /dev/null +++ b/contrib/libs/mimalloc/src/free.c @@ -0,0 +1,520 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018-2024, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ +#if !defined(MI_IN_ALLOC_C) +#error "this file should be included from 'alloc.c' (so aliases can work from alloc-override)" +// add includes help an IDE +#include "mimalloc.h" +#include "mimalloc/internal.h" +#include "mimalloc/atomic.h" +#include "mimalloc/prim.h" // _mi_prim_thread_id() +#endif + +// forward declarations +static void mi_check_padding(const mi_page_t* page, const mi_block_t* block); +static bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block); +static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block); +static void mi_stat_free(const mi_page_t* page, const mi_block_t* block); + + +// ------------------------------------------------------ +// Free +// ------------------------------------------------------ + +// forward declaration of multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) +static mi_decl_noinline void mi_free_block_mt(mi_page_t* page, mi_segment_t* segment, mi_block_t* block); + +// regular free of a (thread local) block pointer +// fast path written carefully to prevent spilling on the stack +static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool track_stats, bool check_full) +{ + // checks + if mi_unlikely(mi_check_is_double_free(page, block)) return; + mi_check_padding(page, block); + if (track_stats) { mi_stat_free(page, block); } + #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN + memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); + #endif + if (track_stats) { mi_track_free_size(block, mi_page_usable_size_of(page, block)); } // faster then mi_usable_size as we already know the page and that p is unaligned + + // actual free: push on the local free list + mi_block_set_next(page, block, page->local_free); + page->local_free = block; + if mi_unlikely(--page->used == 0) { + _mi_page_retire(page); + } + else if mi_unlikely(check_full 
&& mi_page_is_in_full(page)) { + _mi_page_unfull(page); + } +} + +// Adjust a block that was allocated aligned, to the actual start of the block in the page. +// note: this can be called from `mi_free_generic_mt` where a non-owning thread accesses the +// `page_start` and `block_size` fields; however these are constant and the page won't be +// deallocated (as the block we are freeing keeps it alive) and thus safe to read concurrently. +mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p) { + mi_assert_internal(page!=NULL && p!=NULL); + + size_t diff = (uint8_t*)p - page->page_start; + size_t adjust; + if mi_likely(page->block_size_shift != 0) { + adjust = diff & (((size_t)1 << page->block_size_shift) - 1); + } + else { + adjust = diff % mi_page_block_size(page); + } + + return (mi_block_t*)((uintptr_t)p - adjust); +} + +// free a local pointer (page parameter comes first for better codegen) +static void mi_decl_noinline mi_free_generic_local(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept { + MI_UNUSED(segment); + mi_block_t* const block = (mi_page_has_aligned(page) ? _mi_page_ptr_unalign(page, p) : (mi_block_t*)p); + mi_free_block_local(page, block, true /* track stats */, true /* check for a full page */); +} + +// free a pointer owned by another thread (page parameter comes first for better codegen) +static void mi_decl_noinline mi_free_generic_mt(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept { + mi_block_t* const block = _mi_page_ptr_unalign(page, p); // don't check `has_aligned` flag to avoid a race (issue #865) + mi_free_block_mt(page, segment, block); +} + +// generic free (for runtime integration) +void mi_decl_noinline _mi_free_generic(mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept { + if (is_local) mi_free_generic_local(page,segment,p); + else mi_free_generic_mt(page,segment,p); +} + +// Get the segment data belonging to a pointer +// This is just a single `and` in release mode but does further checks in debug mode +// (and secure mode) to see if this was a valid pointer. 
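`_mi_page_ptr_unalign` above recovers the block start from an interior pointer, using a mask when the block size is a power of two and a modulo otherwise. A tiny standalone sketch of that calculation (hypothetical demo_* names):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

// Given the start of a run of equally sized blocks and an interior pointer,
// return the start of the block containing it. When the block size is a
// power of two (`block_size_shift != 0`) a mask replaces the slower modulo.
static void* demo_block_start(uint8_t* area_start, size_t block_size,
                              unsigned block_size_shift, void* p) {
  size_t diff = (size_t)((uint8_t*)p - area_start);
  size_t adjust = (block_size_shift != 0)
                    ? (diff & (((size_t)1 << block_size_shift) - 1))
                    : (diff % block_size);
  return (uint8_t*)p - adjust;
}

int main(void) {
  uint8_t area[256];
  void* p = &area[3 * 32 + 7];                      // interior pointer into block 3
  void* start = demo_block_start(area, 32, 5, p);   // 32-byte blocks, shift 5
  printf("block start offset: %td\n", (uint8_t*)start - (uint8_t*)area);  // 96
  return 0;
}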
+static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* msg) +{ + MI_UNUSED(msg); + +#if (MI_DEBUG>0) + if mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0) { + _mi_error_message(EINVAL, "%s: invalid (unaligned) pointer: %p\n", msg, p); + return NULL; + } +#endif + + mi_segment_t* const segment = _mi_ptr_segment(p); + if mi_unlikely(segment==NULL) return segment; + +#if (MI_DEBUG>0) + if mi_unlikely(!mi_is_in_heap_region(p)) { + _mi_warning_message("%s: pointer might not point to a valid heap region: %p\n" + "(this may still be a valid very large allocation (over 64MiB))\n", msg, p); + if mi_likely(_mi_ptr_cookie(segment) == segment->cookie) { + _mi_warning_message("(yes, the previous pointer %p was valid after all)\n", p); + } + } +#endif +#if (MI_DEBUG>0 || MI_SECURE>=4) + if mi_unlikely(_mi_ptr_cookie(segment) != segment->cookie) { + _mi_error_message(EINVAL, "%s: pointer does not point to a valid heap space: %p\n", msg, p); + return NULL; + } +#endif + + return segment; +} + +// Free a block +// Fast path written carefully to prevent register spilling on the stack +void mi_free(void* p) mi_attr_noexcept +{ + mi_segment_t* const segment = mi_checked_ptr_segment(p,"mi_free"); + if mi_unlikely(segment==NULL) return; + + const bool is_local = (_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); + mi_page_t* const page = _mi_segment_page_of(segment, p); + + if mi_likely(is_local) { // thread-local free? + if mi_likely(page->flags.full_aligned == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) + // thread-local, aligned, and not a full page + mi_block_t* const block = (mi_block_t*)p; + mi_free_block_local(page, block, true /* track stats */, false /* no need to check if the page is full */); + } + else { + // page is full or contains (inner) aligned blocks; use generic path + mi_free_generic_local(page, segment, p); + } + } + else { + // not thread-local; use generic path + mi_free_generic_mt(page, segment, p); + } +} + +// return true if successful +bool _mi_free_delayed_block(mi_block_t* block) { + // get segment and page + mi_assert_internal(block!=NULL); + const mi_segment_t* const segment = _mi_ptr_segment(block); + mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie); + mi_assert_internal(_mi_thread_id() == segment->thread_id); + mi_page_t* const page = _mi_segment_page_of(segment, block); + + // Clear the no-delayed flag so delayed freeing is used again for this page. + // This must be done before collecting the free lists on this page -- otherwise + // some blocks may end up in the page `thread_free` list with no blocks in the + // heap `thread_delayed_free` list which may cause the page to be never freed! 
+ // (it would only be freed if we happen to scan it in `mi_page_queue_find_free_ex`) + if (!_mi_page_try_use_delayed_free(page, MI_USE_DELAYED_FREE, false /* dont overwrite never delayed */)) { + return false; + } + + // collect all other non-local frees (move from `thread_free` to `free`) to ensure up-to-date `used` count + _mi_page_free_collect(page, false); + + // and free the block (possibly freeing the page as well since `used` is updated) + mi_free_block_local(page, block, false /* stats have already been adjusted */, true /* check for a full page */); + return true; +} + +// ------------------------------------------------------ +// Multi-threaded Free (`_mt`) +// ------------------------------------------------------ + +// Push a block that is owned by another thread on its page-local thread free +// list or it's heap delayed free list. Such blocks are later collected by +// the owning thread in `_mi_free_delayed_block`. +static void mi_decl_noinline mi_free_block_delayed_mt( mi_page_t* page, mi_block_t* block ) +{ + // Try to put the block on either the page-local thread free list, + // or the heap delayed free list (if this is the first non-local free in that page) + mi_thread_free_t tfreex; + bool use_delayed; + mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free); + do { + use_delayed = (mi_tf_delayed(tfree) == MI_USE_DELAYED_FREE); + if mi_unlikely(use_delayed) { + // unlikely: this only happens on the first concurrent free in a page that is in the full list + tfreex = mi_tf_set_delayed(tfree,MI_DELAYED_FREEING); + } + else { + // usual: directly add to page thread_free list + mi_block_set_next(page, block, mi_tf_block(tfree)); + tfreex = mi_tf_set_block(tfree,block); + } + } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); + + // If this was the first non-local free, we need to push it on the heap delayed free list instead + if mi_unlikely(use_delayed) { + // racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see `mi_heap_delete` and `mi_heap_collect_abandon`) + mi_heap_t* const heap = (mi_heap_t*)(mi_atomic_load_acquire(&page->xheap)); //mi_page_heap(page); + mi_assert_internal(heap != NULL); + if (heap != NULL) { + // add to the delayed free list of this heap. 
(do this atomically as the lock only protects heap memory validity) + mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); + do { + mi_block_set_nextx(heap,block,dfree, heap->keys); + } while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block)); + } + + // and reset the MI_DELAYED_FREEING flag + tfree = mi_atomic_load_relaxed(&page->xthread_free); + do { + tfreex = tfree; + mi_assert_internal(mi_tf_delayed(tfree) == MI_DELAYED_FREEING); + tfreex = mi_tf_set_delayed(tfree,MI_NO_DELAYED_FREE); + } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); + } +} + +// Multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) +static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_segment_t* segment, mi_block_t* block) +{ + // first see if the segment was abandoned and if we can reclaim it into our thread + if (mi_option_is_enabled(mi_option_abandoned_reclaim_on_free) && + #if MI_HUGE_PAGE_ABANDON + segment->page_kind != MI_PAGE_HUGE && + #endif + mi_atomic_load_relaxed(&segment->thread_id) == 0) + { + // the segment is abandoned, try to reclaim it into our heap + if (_mi_segment_attempt_reclaim(mi_heap_get_default(), segment)) { + mi_assert_internal(_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); + mi_free(block); // recursively free as now it will be a local free in our heap + return; + } + } + + // The padding check may access the non-thread-owned page for the key values. + // that is safe as these are constant and the page won't be freed (as the block is not freed yet). + mi_check_padding(page, block); + + // adjust stats (after padding check and potentially recursive `mi_free` above) + mi_stat_free(page, block); // stat_free may access the padding + mi_track_free_size(block, mi_page_usable_size_of(page,block)); + + // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection + _mi_padding_shrink(page, block, sizeof(mi_block_t)); + + if (segment->page_kind == MI_PAGE_HUGE) { + #if MI_HUGE_PAGE_ABANDON + // huge page segments are always abandoned and can be freed immediately + _mi_segment_huge_page_free(segment, page, block); + return; + #else + // huge pages are special as they occupy the entire segment + // as these are large we reset the memory occupied by the page so it is available to other threads + // (as the owning thread needs to actually free the memory later). 
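The delayed multi-threaded free above pushes blocks onto an atomic list with a CAS loop so the owning thread can collect them later. A simplified, self-contained sketch of that push/collect pattern (hypothetical demo_* names; the real code additionally packs a delayed-free state into the same word, which is omitted here):

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

typedef struct demo_block_s {
  struct demo_block_s* next;
} demo_block_t;

// Non-owning thread: push a freed block onto the shared list. The release
// ordering makes the block's `next` field visible to the owner once it
// observes the new head.
static void demo_thread_free_push(_Atomic(demo_block_t*)* head, demo_block_t* block) {
  demo_block_t* old = atomic_load_explicit(head, memory_order_relaxed);
  do {
    block->next = old;
  } while (!atomic_compare_exchange_weak_explicit(
               head, &old, block, memory_order_release, memory_order_relaxed));
}

// Owning thread: take the entire list in one atomic exchange and walk it.
static int demo_thread_free_collect(_Atomic(demo_block_t*)* head) {
  demo_block_t* list = atomic_exchange_explicit(head, NULL, memory_order_acquire);
  int n = 0;
  for (demo_block_t* cur = list; cur != NULL; cur = cur->next) n++;
  return n;
}

int main(void) {
  _Atomic(demo_block_t*) head = NULL;
  demo_block_t a = {0}, b = {0};
  demo_thread_free_push(&head, &a);
  demo_thread_free_push(&head, &b);
  printf("collected %d blocks\n", demo_thread_free_collect(&head));  // 2
  return 0;
}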
+ _mi_segment_huge_page_reset(segment, page, block); + #endif + } + else { + #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading + memset(block, MI_DEBUG_FREED, mi_usable_size(block)); + #endif + } + + // and finally free the actual block by pushing it on the owning heap + // thread_delayed free list (or heap delayed free list) + mi_free_block_delayed_mt(page,block); +} + + +// ------------------------------------------------------ +// Usable size +// ------------------------------------------------------ + +// Bytes available in a block +static size_t mi_decl_noinline mi_page_usable_aligned_size_of(const mi_page_t* page, const void* p) mi_attr_noexcept { + const mi_block_t* block = _mi_page_ptr_unalign(page, p); + const size_t size = mi_page_usable_size_of(page, block); + const ptrdiff_t adjust = (uint8_t*)p - (uint8_t*)block; + mi_assert_internal(adjust >= 0 && (size_t)adjust <= size); + return (size - adjust); +} + +static inline size_t _mi_usable_size(const void* p, const char* msg) mi_attr_noexcept { + const mi_segment_t* const segment = mi_checked_ptr_segment(p, msg); + if mi_unlikely(segment==NULL) return 0; + const mi_page_t* const page = _mi_segment_page_of(segment, p); + if mi_likely(!mi_page_has_aligned(page)) { + const mi_block_t* block = (const mi_block_t*)p; + return mi_page_usable_size_of(page, block); + } + else { + // split out to separate routine for improved code generation + return mi_page_usable_aligned_size_of(page, p); + } +} + +mi_decl_nodiscard size_t mi_usable_size(const void* p) mi_attr_noexcept { + return _mi_usable_size(p, "mi_usable_size"); +} + + +// ------------------------------------------------------ +// Free variants +// ------------------------------------------------------ + +void mi_free_size(void* p, size_t size) mi_attr_noexcept { + MI_UNUSED_RELEASE(size); + mi_assert(p == NULL || size <= _mi_usable_size(p,"mi_free_size")); + mi_free(p); +} + +void mi_free_size_aligned(void* p, size_t size, size_t alignment) mi_attr_noexcept { + MI_UNUSED_RELEASE(alignment); + mi_assert(((uintptr_t)p % alignment) == 0); + mi_free_size(p,size); +} + +void mi_free_aligned(void* p, size_t alignment) mi_attr_noexcept { + MI_UNUSED_RELEASE(alignment); + mi_assert(((uintptr_t)p % alignment) == 0); + mi_free(p); +} + + +// ------------------------------------------------------ +// Check for double free in secure and debug mode +// This is somewhat expensive so only enabled for secure mode 4 +// ------------------------------------------------------ + +#if (MI_ENCODE_FREELIST && (MI_SECURE>=4 || MI_DEBUG!=0)) +// linear check if the free list contains a specific element +static bool mi_list_contains(const mi_page_t* page, const mi_block_t* list, const mi_block_t* elem) { + while (list != NULL) { + if (elem==list) return true; + list = mi_block_next(page, list); + } + return false; +} + +static mi_decl_noinline bool mi_check_is_double_freex(const mi_page_t* page, const mi_block_t* block) { + // The decoded value is in the same page (or NULL). 
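The double-free check here uses a two-step idea: a cheap plausibility test on the block's first word before paying for a linear walk of the free lists. A reduced standalone sketch of that shape (hypothetical demo_* names; the real code first decodes the encoded free-list pointer with the page keys):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef struct demo_block_s { struct demo_block_s* next; } demo_block_t;

// Expensive part: linear scan of a free list for `elem`.
static bool demo_list_contains(const demo_block_t* list, const demo_block_t* elem) {
  for (; list != NULL; list = list->next) {
    if (list == elem) return true;
  }
  return false;
}

// Cheap filter first: a block that is already free has a first word that is
// NULL or an aligned pointer into the same region, so only such blocks are
// worth the list walk.
static bool demo_is_double_free(const void* region, size_t region_size,
                                const demo_block_t* free_list,
                                const demo_block_t* block) {
  const demo_block_t* n = block->next;  // first word, read as a "next" pointer
  bool plausible =
      (n == NULL) ||
      (((uintptr_t)n % sizeof(void*)) == 0 &&
       (uintptr_t)n - (uintptr_t)region < region_size);
  return plausible && demo_list_contains(free_list, block);
}

int main(void) {
  demo_block_t blocks[4] = {{0}, {0}, {0}, {0}};
  blocks[1].next = &blocks[2];              // free list: blocks[1] -> blocks[2]
  const demo_block_t* free_list = &blocks[1];
  printf("blocks[1] double free? %d\n",
         demo_is_double_free(blocks, sizeof(blocks), free_list, &blocks[1]));  // 1
  printf("blocks[0] double free? %d\n",
         demo_is_double_free(blocks, sizeof(blocks), free_list, &blocks[0]));  // 0
  return 0;
}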
+ // Walk the free lists to verify positively if it is already freed + if (mi_list_contains(page, page->free, block) || + mi_list_contains(page, page->local_free, block) || + mi_list_contains(page, mi_page_thread_free(page), block)) + { + _mi_error_message(EAGAIN, "double free detected of block %p with size %zu\n", block, mi_page_block_size(page)); + return true; + } + return false; +} + +#define mi_track_page(page,access) { size_t psize; void* pstart = _mi_page_start(_mi_page_segment(page),page,&psize); mi_track_mem_##access( pstart, psize); } + +static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) { + bool is_double_free = false; + mi_block_t* n = mi_block_nextx(page, block, page->keys); // pretend it is freed, and get the decoded first field + if (((uintptr_t)n & (MI_INTPTR_SIZE-1))==0 && // quick check: aligned pointer? + (n==NULL || mi_is_in_same_page(block, n))) // quick check: in same page or NULL? + { + // Suspicious: decoded value a in block is in the same page (or NULL) -- maybe a double free? + // (continue in separate function to improve code generation) + is_double_free = mi_check_is_double_freex(page, block); + } + return is_double_free; +} +#else +static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) { + MI_UNUSED(page); + MI_UNUSED(block); + return false; +} +#endif + + +// --------------------------------------------------------------------------- +// Check for heap block overflow by setting up padding at the end of the block +// --------------------------------------------------------------------------- + +#if MI_PADDING // && !MI_TRACK_ENABLED +static bool mi_page_decode_padding(const mi_page_t* page, const mi_block_t* block, size_t* delta, size_t* bsize) { + *bsize = mi_page_usable_block_size(page); + const mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + *bsize); + mi_track_mem_defined(padding,sizeof(mi_padding_t)); + *delta = padding->delta; + uint32_t canary = padding->canary; + uintptr_t keys[2]; + keys[0] = page->keys[0]; + keys[1] = page->keys[1]; + bool ok = ((uint32_t)mi_ptr_encode(page,block,keys) == canary && *delta <= *bsize); + mi_track_mem_noaccess(padding,sizeof(mi_padding_t)); + return ok; +} + +// Return the exact usable size of a block. +static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) { + size_t bsize; + size_t delta; + bool ok = mi_page_decode_padding(page, block, &delta, &bsize); + mi_assert_internal(ok); mi_assert_internal(delta <= bsize); + return (ok ? bsize - delta : 0); +} + +// When a non-thread-local block is freed, it becomes part of the thread delayed free +// list that is freed later by the owning heap. If the exact usable size is too small to +// contain the pointer for the delayed list, then shrink the padding (by decreasing delta) +// so it will later not trigger an overflow error in `mi_free_block`. 
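The padding scheme decoded above stores a small footer (canary plus `delta`) after each block and paints the slack bytes so overflows can be detected on free. A reduced, self-contained sketch of the same check (hypothetical names; the real canary is a keyed encoding of the block address rather than a constant):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define DEMO_FILL    0xDE
#define DEMO_CANARY  0x5AFEC0DEu
#define DEMO_USABLE  24u   // bytes handed to the caller
#define DEMO_SLACK   8u    // fill bytes between the usable area and the footer
#define DEMO_BSIZE   (DEMO_USABLE + DEMO_SLACK)

typedef struct demo_padding_s {
  uint32_t canary;  // guards the footer itself
  uint32_t delta;   // DEMO_BSIZE - delta == exact usable size
} demo_padding_t;

typedef struct demo_block_s {
  uint8_t bytes[DEMO_BSIZE];   // usable bytes followed by fill bytes
  demo_padding_t pad;          // footer
} demo_block_t;

static void demo_alloc_init(demo_block_t* b) {
  memset(b->bytes + DEMO_USABLE, DEMO_FILL, DEMO_SLACK);  // paint the slack
  b->pad.canary = DEMO_CANARY;
  b->pad.delta  = DEMO_SLACK;
}

// On free: the canary must be intact and the slack bytes untouched, otherwise
// something wrote past the usable size.
static bool demo_check_padding(const demo_block_t* b) {
  if (b->pad.canary != DEMO_CANARY || b->pad.delta > DEMO_BSIZE) return false;
  for (uint32_t i = 0; i < b->pad.delta; i++) {
    if (b->bytes[DEMO_BSIZE - b->pad.delta + i] != DEMO_FILL) return false;
  }
  return true;
}

int main(void) {
  demo_block_t b;
  demo_alloc_init(&b);
  printf("fresh block : %s\n", demo_check_padding(&b) ? "ok" : "overflow");
  b.bytes[DEMO_USABLE] = 0;   // simulate a one-byte heap overflow
  printf("after write : %s\n", demo_check_padding(&b) ? "ok" : "overflow");
  return 0;
}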
+void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) { + size_t bsize; + size_t delta; + bool ok = mi_page_decode_padding(page, block, &delta, &bsize); + mi_assert_internal(ok); + if (!ok || (bsize - delta) >= min_size) return; // usually already enough space + mi_assert_internal(bsize >= min_size); + if (bsize < min_size) return; // should never happen + size_t new_delta = (bsize - min_size); + mi_assert_internal(new_delta < bsize); + mi_padding_t* padding = (mi_padding_t*)((uint8_t*)block + bsize); + mi_track_mem_defined(padding,sizeof(mi_padding_t)); + padding->delta = (uint32_t)new_delta; + mi_track_mem_noaccess(padding,sizeof(mi_padding_t)); +} +#else +static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) { + MI_UNUSED(block); + return mi_page_usable_block_size(page); +} + +void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) { + MI_UNUSED(page); + MI_UNUSED(block); + MI_UNUSED(min_size); +} +#endif + +#if MI_PADDING && MI_PADDING_CHECK + +static bool mi_verify_padding(const mi_page_t* page, const mi_block_t* block, size_t* size, size_t* wrong) { + size_t bsize; + size_t delta; + bool ok = mi_page_decode_padding(page, block, &delta, &bsize); + *size = *wrong = bsize; + if (!ok) return false; + mi_assert_internal(bsize >= delta); + *size = bsize - delta; + if (!mi_page_is_huge(page)) { + uint8_t* fill = (uint8_t*)block + bsize - delta; + const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? MI_MAX_ALIGN_SIZE : delta); // check at most the first N padding bytes + mi_track_mem_defined(fill, maxpad); + for (size_t i = 0; i < maxpad; i++) { + if (fill[i] != MI_DEBUG_PADDING) { + *wrong = bsize - delta + i; + ok = false; + break; + } + } + mi_track_mem_noaccess(fill, maxpad); + } + return ok; +} + +static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { + size_t size; + size_t wrong; + if (!mi_verify_padding(page,block,&size,&wrong)) { + _mi_error_message(EFAULT, "buffer overflow in heap block %p of size %zu: write after %zu bytes\n", block, size, wrong ); + } +} + +#else + +static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { + MI_UNUSED(page); + MI_UNUSED(block); +} + +#endif + +// only maintain stats for smaller objects if requested +#if (MI_STAT>0) +static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { +#if (MI_STAT < 2) + MI_UNUSED(block); +#endif + mi_heap_t* const heap = mi_heap_get_default(); + const size_t bsize = mi_page_usable_block_size(page); +#if (MI_STAT>1) + const size_t usize = mi_page_usable_size_of(page, block); + mi_heap_stat_decrease(heap, malloc, usize); +#endif + if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { + mi_heap_stat_decrease(heap, normal, bsize); +#if (MI_STAT > 1) + mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], 1); +#endif + } + else { + const size_t bpsize = mi_page_block_size(page); // match stat in page.c:mi_huge_page_alloc + mi_heap_stat_decrease(heap, huge, bpsize); + } +} +#else +static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { + MI_UNUSED(page); MI_UNUSED(block); +} +#endif diff --git a/contrib/libs/mimalloc/src/heap.c b/contrib/libs/mimalloc/src/heap.c index bda10699d076..f6f2354913a6 100644 --- a/contrib/libs/mimalloc/src/heap.c +++ b/contrib/libs/mimalloc/src/heap.c @@ -6,8 +6,9 @@ terms of the MIT license. 
A copy of the license can be found in the file -----------------------------------------------------------------------------*/ #include "mimalloc.h" -#include "mimalloc-internal.h" -#include "mimalloc-atomic.h" +#include "mimalloc/internal.h" +#include "mimalloc/atomic.h" +#include "mimalloc/prim.h" // mi_prim_get_default_heap #include // memset, memcpy @@ -30,15 +31,18 @@ static bool mi_heap_visit_pages(mi_heap_t* heap, heap_page_visitor_fun* fn, void // visit all pages #if MI_DEBUG>1 size_t total = heap->page_count; - #endif size_t count = 0; + #endif + for (size_t i = 0; i <= MI_BIN_FULL; i++) { mi_page_queue_t* pq = &heap->pages[i]; mi_page_t* page = pq->first; while(page != NULL) { mi_page_t* next = page->next; // save next in case the page gets removed from the queue mi_assert_internal(mi_page_heap(page) == heap); + #if MI_DEBUG>1 count++; + #endif if (!fn(heap, pq, page, arg1, arg2)) return false; page = next; // and continue } @@ -50,9 +54,9 @@ static bool mi_heap_visit_pages(mi_heap_t* heap, heap_page_visitor_fun* fn, void #if MI_DEBUG>=2 static bool mi_heap_page_is_valid(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) { - UNUSED(arg1); - UNUSED(arg2); - UNUSED(pq); + MI_UNUSED(arg1); + MI_UNUSED(arg2); + MI_UNUSED(pq); mi_assert_internal(mi_page_heap(page) == heap); mi_segment_t* segment = _mi_page_segment(page); mi_assert_internal(segment->thread_id == heap->thread_id); @@ -86,13 +90,13 @@ typedef enum mi_collect_e { static bool mi_heap_page_collect(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg_collect, void* arg2 ) { - UNUSED(arg2); - UNUSED(heap); + MI_UNUSED(arg2); + MI_UNUSED(heap); mi_assert_internal(mi_heap_page_is_valid(heap, pq, page, NULL, NULL)); mi_collect_t collect = *((mi_collect_t*)arg_collect); _mi_page_free_collect(page, collect >= MI_FORCE); if (mi_page_all_free(page)) { - // no more used blocks, free the page. + // no more used blocks, free the page. // note: this will free retired pages as well. _mi_page_free(page, pq, collect >= MI_FORCE); } @@ -104,10 +108,10 @@ static bool mi_heap_page_collect(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t } static bool mi_heap_page_never_delayed_free(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) { - UNUSED(arg1); - UNUSED(arg2); - UNUSED(heap); - UNUSED(pq); + MI_UNUSED(arg1); + MI_UNUSED(arg2); + MI_UNUSED(heap); + MI_UNUSED(pq); _mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false); return true; // don't break } @@ -115,47 +119,53 @@ static bool mi_heap_page_never_delayed_free(mi_heap_t* heap, mi_page_queue_t* pq static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) { if (heap==NULL || !mi_heap_is_initialized(heap)) return; - _mi_deferred_free(heap, collect >= MI_FORCE); - // note: never reclaim on collect but leave it to threads that need storage to reclaim + const bool force = (collect >= MI_FORCE); + _mi_deferred_free(heap, force); + + // python/cpython#112532: we may be called from a thread that is not the owner of the heap + const bool is_main_thread = (_mi_is_main_thread() && heap->thread_id == _mi_thread_id()); + + // note: never reclaim on collect but leave it to threads that need storage to reclaim if ( #ifdef NDEBUG collect == MI_FORCE #else collect >= MI_FORCE #endif - && _mi_is_main_thread() && mi_heap_is_backing(heap) && !heap->no_reclaim) + && is_main_thread && mi_heap_is_backing(heap) && !heap->no_reclaim) { // the main thread is abandoned (end-of-program), try to reclaim all abandoned segments. 
// if all memory is freed by now, all segments should be freed. _mi_abandoned_reclaim_all(heap, &heap->tld->segments); } - + // if abandoning, mark all pages to no longer add to delayed_free if (collect == MI_ABANDON) { mi_heap_visit_pages(heap, &mi_heap_page_never_delayed_free, NULL, NULL); } - // free thread delayed blocks. + // free all current thread delayed blocks. // (if abandoning, after this there are no more thread-delayed references into the pages.) - _mi_heap_delayed_free(heap); + _mi_heap_delayed_free_all(heap); // collect retired pages - _mi_heap_collect_retired(heap, collect >= MI_FORCE); + _mi_heap_collect_retired(heap, force); // collect all pages owned by this thread mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL); mi_assert_internal( collect != MI_ABANDON || mi_atomic_load_ptr_acquire(mi_block_t,&heap->thread_delayed_free) == NULL ); - // collect segment caches - if (collect >= MI_FORCE) { - _mi_segment_thread_collect(&heap->tld->segments); - } + // collect segments (purge pages, this can be expensive so don't force on abandonment) + _mi_segments_collect(collect == MI_FORCE, &heap->tld->segments); - // collect regions on program-exit (or shared library unload) - if (collect >= MI_FORCE && _mi_is_main_thread() && mi_heap_is_backing(heap)) { - _mi_mem_collect(&heap->tld->os); + // if forced, collect thread data cache on program-exit (or shared library unload) + if (force && is_main_thread && mi_heap_is_backing(heap)) { + _mi_thread_data_collect(); // collect thread data cache } + + // collect arenas (this is program wide so don't force purges on abandonment of threads) + _mi_arenas_collect(collect == MI_FORCE /* force purge? */, &heap->tld->stats); } void _mi_heap_collect_abandon(mi_heap_t* heap) { @@ -167,7 +177,7 @@ void mi_heap_collect(mi_heap_t* heap, bool force) mi_attr_noexcept { } void mi_collect(bool force) mi_attr_noexcept { - mi_heap_collect(mi_get_default_heap(), force); + mi_heap_collect(mi_prim_get_default_heap(), force); } @@ -177,9 +187,14 @@ void mi_collect(bool force) mi_attr_noexcept { mi_heap_t* mi_heap_get_default(void) { mi_thread_init(); - return mi_get_default_heap(); + return mi_prim_get_default_heap(); +} + +static bool mi_heap_is_default(const mi_heap_t* heap) { + return (heap == mi_prim_get_default_heap()); } + mi_heap_t* mi_heap_get_backing(void) { mi_heap_t* heap = mi_heap_get_default(); mi_assert_internal(heap!=NULL); @@ -189,24 +204,44 @@ mi_heap_t* mi_heap_get_backing(void) { return bheap; } -mi_heap_t* mi_heap_new(void) { - mi_heap_t* bheap = mi_heap_get_backing(); - mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t); // todo: OS allocate in secure mode? 
- if (heap==NULL) return NULL; +void _mi_heap_init(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag) { _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t)); - heap->tld = bheap->tld; - heap->thread_id = _mi_thread_id(); - _mi_random_split(&bheap->random, &heap->random); + heap->tld = tld; + heap->thread_id = _mi_thread_id(); + heap->arena_id = arena_id; + heap->no_reclaim = noreclaim; + heap->tag = tag; + if (heap == tld->heap_backing) { + _mi_random_init(&heap->random); + } + else { + _mi_random_split(&tld->heap_backing->random, &heap->random); + } heap->cookie = _mi_heap_random_next(heap) | 1; heap->keys[0] = _mi_heap_random_next(heap); heap->keys[1] = _mi_heap_random_next(heap); - heap->no_reclaim = true; // don't reclaim abandoned pages or otherwise destroy is unsafe // push on the thread local heaps list heap->next = heap->tld->heaps; heap->tld->heaps = heap; +} + +mi_decl_nodiscard mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id) { + mi_heap_t* bheap = mi_heap_get_backing(); + mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t); // todo: OS allocate in secure mode? + if (heap == NULL) return NULL; + // don't reclaim abandoned pages or otherwise destroy is unsafe + _mi_heap_init(heap, bheap->tld, arena_id, true /* no reclaim */, 0 /* default tag */); return heap; } +mi_decl_nodiscard mi_heap_t* mi_heap_new(void) { + return mi_heap_new_in_arena(_mi_arena_id_none()); +} + +bool _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid) { + return _mi_arena_memid_is_suitable(memid, heap->arena_id); +} + uintptr_t _mi_heap_random_next(mi_heap_t* heap) { return _mi_random_next(&heap->random); } @@ -217,9 +252,6 @@ static void mi_heap_reset_pages(mi_heap_t* heap) { mi_assert_internal(mi_heap_is_initialized(heap)); // TODO: copy full empty heap instead? 
memset(&heap->pages_free_direct, 0, sizeof(heap->pages_free_direct)); -#ifdef MI_MEDIUM_DIRECT - memset(&heap->pages_free_medium, 0, sizeof(heap->pages_free_medium)); -#endif _mi_memcpy_aligned(&heap->pages, &_mi_heap_empty.pages, sizeof(heap->pages)); heap->thread_delayed_free = NULL; heap->page_count = 0; @@ -240,7 +272,7 @@ static void mi_heap_free(mi_heap_t* heap) { // remove ourselves from the thread local heaps list // linear search but we expect the number of heaps to be relatively small mi_heap_t* prev = NULL; - mi_heap_t* curr = heap->tld->heaps; + mi_heap_t* curr = heap->tld->heaps; while (curr != heap && curr != NULL) { prev = curr; curr = curr->next; @@ -256,16 +288,28 @@ static void mi_heap_free(mi_heap_t* heap) { mi_free(heap); } +// return a heap on the same thread as `heap` specialized for the specified tag (if it exists) +mi_heap_t* _mi_heap_by_tag(mi_heap_t* heap, uint8_t tag) { + if (heap->tag == tag) { + return heap; + } + for (mi_heap_t *curr = heap->tld->heaps; curr != NULL; curr = curr->next) { + if (curr->tag == tag) { + return curr; + } + } + return NULL; +} /* ----------------------------------------------------------- Heap destroy ----------------------------------------------------------- */ static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) { - UNUSED(arg1); - UNUSED(arg2); - UNUSED(heap); - UNUSED(pq); + MI_UNUSED(arg1); + MI_UNUSED(arg2); + MI_UNUSED(heap); + MI_UNUSED(pq); // ensure no more thread_delayed_free will be added _mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false); @@ -273,12 +317,7 @@ static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_ // stats const size_t bsize = mi_page_block_size(page); if (bsize > MI_LARGE_OBJ_SIZE_MAX) { - if (bsize > MI_HUGE_OBJ_SIZE_MAX) { - mi_heap_stat_decrease(heap, giant, bsize); - } - else { - mi_heap_stat_decrease(heap, huge, bsize); - } + mi_heap_stat_decrease(heap, huge, bsize); } #if (MI_STAT) _mi_page_free_collect(page, false); // update used count @@ -310,6 +349,14 @@ void _mi_heap_destroy_pages(mi_heap_t* heap) { mi_heap_reset_pages(heap); } +#if MI_TRACK_HEAP_DESTROY +static bool mi_cdecl mi_heap_track_block_free(const mi_heap_t* heap, const mi_heap_area_t* area, void* block, size_t block_size, void* arg) { + MI_UNUSED(heap); MI_UNUSED(area); MI_UNUSED(arg); MI_UNUSED(block_size); + mi_track_free_size(block,mi_usable_size(block)); + return true; +} +#endif + void mi_heap_destroy(mi_heap_t* heap) { mi_assert(heap != NULL); mi_assert(mi_heap_is_initialized(heap)); @@ -321,27 +368,45 @@ void mi_heap_destroy(mi_heap_t* heap) { mi_heap_delete(heap); } else { + // track all blocks as freed + #if MI_TRACK_HEAP_DESTROY + mi_heap_visit_blocks(heap, true, mi_heap_track_block_free, NULL); + #endif // free all pages _mi_heap_destroy_pages(heap); mi_heap_free(heap); } } - +// forcefully destroy all heaps in the current thread +void _mi_heap_unsafe_destroy_all(void) { + mi_heap_t* bheap = mi_heap_get_backing(); + mi_heap_t* curr = bheap->tld->heaps; + while (curr != NULL) { + mi_heap_t* next = curr->next; + if (curr->no_reclaim) { + mi_heap_destroy(curr); + } + else { + _mi_heap_destroy_pages(curr); + } + curr = next; + } +} /* ----------------------------------------------------------- Safe Heap delete ----------------------------------------------------------- */ -// Tranfer the pages from one heap to the other +// Transfer the pages from one heap to the other static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) { 
mi_assert_internal(heap!=NULL); if (from==NULL || from->page_count == 0) return; // reduce the size of the delayed frees - _mi_heap_delayed_free(from); - - // transfer all pages by appending the queues; this will set a new heap field + _mi_heap_delayed_free_partial(from); + + // transfer all pages by appending the queues; this will set a new heap field // so threads may do delayed frees in either heap for a while. // note: appending waits for each page to not be in the `MI_DELAYED_FREEING` state // so after this only the new heap will get delayed frees @@ -354,17 +419,17 @@ static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) { } mi_assert_internal(from->page_count == 0); - // and do outstanding delayed frees in the `from` heap + // and do outstanding delayed frees in the `from` heap // note: be careful here as the `heap` field in all those pages no longer point to `from`, - // turns out to be ok as `_mi_heap_delayed_free` only visits the list and calls a + // turns out to be ok as `_mi_heap_delayed_free` only visits the list and calls a // the regular `_mi_free_delayed_block` which is safe. - _mi_heap_delayed_free(from); + _mi_heap_delayed_free_all(from); #if !defined(_MSC_VER) || (_MSC_VER > 1900) // somehow the following line gives an error in VS2015, issue #353 mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_block_t,&from->thread_delayed_free) == NULL); #endif // and reset the `from` heap - mi_heap_reset_pages(from); + mi_heap_reset_pages(from); } // Safe delete a heap without freeing any still allocated blocks in that heap. @@ -376,7 +441,7 @@ void mi_heap_delete(mi_heap_t* heap) if (heap==NULL || !mi_heap_is_initialized(heap)) return; if (!mi_heap_is_backing(heap)) { - // tranfer still used pages to the backing heap + // transfer still used pages to the backing heap mi_heap_absorb(heap->tld->heap_backing, heap); } else { @@ -392,7 +457,7 @@ mi_heap_t* mi_heap_set_default(mi_heap_t* heap) { mi_assert(mi_heap_is_initialized(heap)); if (heap==NULL || !mi_heap_is_initialized(heap)) return NULL; mi_assert_expensive(mi_heap_is_valid(heap)); - mi_heap_t* old = mi_get_default_heap(); + mi_heap_t* old = mi_prim_get_default_heap(); _mi_heap_set_default_direct(heap); return old; } @@ -410,7 +475,7 @@ static mi_heap_t* mi_heap_of_block(const void* p) { mi_segment_t* segment = _mi_ptr_segment(p); bool valid = (_mi_ptr_cookie(segment) == segment->cookie); mi_assert_internal(valid); - if (mi_unlikely(!valid)) return NULL; + if mi_unlikely(!valid) return NULL; return mi_page_heap(_mi_segment_page_of(segment,p)); } @@ -422,11 +487,10 @@ bool mi_heap_contains_block(mi_heap_t* heap, const void* p) { static bool mi_heap_page_check_owned(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* p, void* vfound) { - UNUSED(heap); - UNUSED(pq); + MI_UNUSED(heap); + MI_UNUSED(pq); bool* found = (bool*)vfound; - mi_segment_t* segment = _mi_page_segment(page); - void* start = _mi_page_start(segment, page, NULL); + void* start = mi_page_start(page); void* end = (uint8_t*)start + (page->capacity * mi_page_block_size(page)); *found = (p >= start && p < end); return (!*found); // continue if not found @@ -442,7 +506,7 @@ bool mi_heap_check_owned(mi_heap_t* heap, const void* p) { } bool mi_check_owned(const void* p) { - return mi_heap_check_owned(mi_get_default_heap(), p); + return mi_heap_check_owned(mi_prim_get_default_heap(), p); } /* ----------------------------------------------------------- @@ -470,13 +534,14 @@ static bool mi_heap_area_visit_blocks(const mi_heap_area_ex_t* xarea, mi_block_v if 
(page->used == 0) return true; const size_t bsize = mi_page_block_size(page); + const size_t ubsize = mi_page_usable_block_size(page); // without padding size_t psize; - uint8_t* pstart = _mi_page_start(_mi_page_segment(page), page, &psize); + uint8_t* pstart = _mi_segment_page_start(_mi_page_segment(page), page, &psize); if (page->capacity == 1) { // optimize page with one block mi_assert_internal(page->used == 1 && page->free == NULL); - return visitor(mi_page_heap(page), area, pstart, bsize, arg); + return visitor(mi_page_heap(page), area, pstart, ubsize, arg); } // create a bitmap of free blocks. @@ -484,9 +549,13 @@ static bool mi_heap_area_visit_blocks(const mi_heap_area_ex_t* xarea, mi_block_v uintptr_t free_map[MI_MAX_BLOCKS / sizeof(uintptr_t)]; memset(free_map, 0, sizeof(free_map)); + #if MI_DEBUG>1 size_t free_count = 0; + #endif for (mi_block_t* block = page->free; block != NULL; block = mi_block_next(page,block)) { + #if MI_DEBUG>1 free_count++; + #endif mi_assert_internal((uint8_t*)block >= pstart && (uint8_t*)block < (pstart + psize)); size_t offset = (uint8_t*)block - pstart; mi_assert_internal(offset % bsize == 0); @@ -499,7 +568,9 @@ static bool mi_heap_area_visit_blocks(const mi_heap_area_ex_t* xarea, mi_block_v mi_assert_internal(page->capacity == (free_count + page->used)); // walk through all blocks skipping the free ones + #if MI_DEBUG>1 size_t used_count = 0; + #endif for (size_t i = 0; i < page->capacity; i++) { size_t bitidx = (i / sizeof(uintptr_t)); size_t bit = i - (bitidx * sizeof(uintptr_t)); @@ -508,9 +579,11 @@ static bool mi_heap_area_visit_blocks(const mi_heap_area_ex_t* xarea, mi_block_v i += (sizeof(uintptr_t) - 1); // skip a run of free blocks } else if ((m & ((uintptr_t)1 << bit)) == 0) { + #if MI_DEBUG>1 used_count++; + #endif uint8_t* block = pstart + (i * bsize); - if (!visitor(mi_page_heap(page), area, block, bsize, arg)) return false; + if (!visitor(mi_page_heap(page), area, block, ubsize, arg)) return false; } } mi_assert_internal(page->used == used_count); @@ -521,17 +594,19 @@ typedef bool (mi_heap_area_visit_fun)(const mi_heap_t* heap, const mi_heap_area_ static bool mi_heap_visit_areas_page(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* vfun, void* arg) { - UNUSED(heap); - UNUSED(pq); + MI_UNUSED(heap); + MI_UNUSED(pq); mi_heap_area_visit_fun* fun = (mi_heap_area_visit_fun*)vfun; mi_heap_area_ex_t xarea; const size_t bsize = mi_page_block_size(page); + const size_t ubsize = mi_page_usable_block_size(page); xarea.page = page; xarea.area.reserved = page->reserved * bsize; xarea.area.committed = page->capacity * bsize; - xarea.area.blocks = _mi_page_start(_mi_page_segment(page), page, NULL); - xarea.area.used = page->used; - xarea.area.block_size = bsize; + xarea.area.blocks = mi_page_start(page); + xarea.area.used = page->used; // number of blocks in use (#553) + xarea.area.block_size = ubsize; + xarea.area.full_block_size = bsize; return fun(heap, &xarea, arg); } diff --git a/contrib/libs/mimalloc/src/init.c b/contrib/libs/mimalloc/src/init.c index c0f09b5ed807..62bb69ddcbd5 100644 --- a/contrib/libs/mimalloc/src/init.c +++ b/contrib/libs/mimalloc/src/init.c @@ -1,33 +1,42 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2021, Microsoft Research, Daan Leijen +Copyright (c) 2018-2022, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. 
A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #include "mimalloc.h" -#include "mimalloc-internal.h" +#include "mimalloc/internal.h" +#include "mimalloc/prim.h" #include // memcpy, memset #include // atexit + // Empty page used to initialize the small free pages array const mi_page_t _mi_page_empty = { - 0, false, false, false, false, + 0, + false, false, false, false, 0, // capacity 0, // reserved capacity { 0 }, // flags false, // is_zero 0, // retire_expire NULL, // free - #if MI_ENCODE_FREELIST + NULL, // local_free + 0, // used + 0, // block size shift + 0, // heap tag + 0, // block_size + NULL, // page_start + #if (MI_PADDING || MI_ENCODE_FREELIST) { 0, 0 }, #endif - 0, // used - 0, // xblock_size - NULL, // local_free - ATOMIC_VAR_INIT(0), // xthread_free - ATOMIC_VAR_INIT(0), // xheap + MI_ATOMIC_VAR_INIT(0), // xthread_free + MI_ATOMIC_VAR_INIT(0), // xheap NULL, NULL + #if MI_INTPTR_SIZE==4 + , { NULL } + #endif }; #define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty) @@ -74,7 +83,9 @@ const mi_page_t _mi_page_empty = { MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ + MI_STAT_COUNT_NULL(), \ + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } \ MI_STAT_COUNT_END_NULL() @@ -89,19 +100,26 @@ const mi_page_t _mi_page_empty = { mi_decl_cache_align const mi_heap_t _mi_heap_empty = { NULL, - MI_SMALL_PAGES_EMPTY, - MI_PAGE_QUEUES_EMPTY, - ATOMIC_VAR_INIT(NULL), + MI_ATOMIC_VAR_INIT(NULL), 0, // tid 0, // cookie + 0, // arena id { 0, 0 }, // keys - { {0}, {0}, 0 }, + { {0}, {0}, 0, true }, // random 0, // page count MI_BIN_FULL, 0, // page retired min/max NULL, // next - false + false, // can reclaim + 0, // tag + MI_SMALL_PAGES_EMPTY, + MI_PAGE_QUEUES_EMPTY }; + +mi_threadid_t _mi_thread_id(void) mi_attr_noexcept { + return _mi_prim_thread_id(); +} + // the thread-local default heap for allocation mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty; @@ -111,7 +129,7 @@ static mi_tld_t tld_main = { 0, false, &_mi_heap_main, &_mi_heap_main, { { NULL, NULL }, {NULL ,NULL}, {NULL ,NULL, 0}, - 0, 0, 0, 0, 0, 0, NULL, + 0, 0, 0, 0, 0, &tld_main.stats, &tld_main.os }, // segments { 0, &tld_main.stats }, // os @@ -120,17 +138,19 @@ static mi_tld_t tld_main = { mi_heap_t _mi_heap_main = { &tld_main, - MI_SMALL_PAGES_EMPTY, - MI_PAGE_QUEUES_EMPTY, - ATOMIC_VAR_INIT(NULL), + MI_ATOMIC_VAR_INIT(NULL), 0, // thread id 0, // initial cookie + 0, // arena id { 0, 0 }, // the key of the main heap can be fixed (unlike page keys that need to be secure!) - { {0x846ca68b}, {0}, 0 }, // random + { {0x846ca68b}, {0}, 0, true }, // random 0, // page count MI_BIN_FULL, 0, // page retired min/max NULL, // next heap - false // can reclaim + false, // can reclaim + 0, // tag + MI_SMALL_PAGES_EMPTY, + MI_PAGE_QUEUES_EMPTY }; bool _mi_process_is_initialized = false; // set to `true` in `mi_process_init`. 
@@ -141,8 +161,13 @@ mi_stats_t _mi_stats_main = { MI_STATS_NULL }; static void mi_heap_main_init(void) { if (_mi_heap_main.cookie == 0) { _mi_heap_main.thread_id = _mi_thread_id(); - _mi_heap_main.cookie = _os_random_weak((uintptr_t)&mi_heap_main_init); - _mi_random_init(&_mi_heap_main.random); + _mi_heap_main.cookie = 1; + #if defined(_WIN32) && !defined(MI_SHARED_LIB) + _mi_random_init_weak(&_mi_heap_main.random); // prevent allocation failure during bcrypt dll initialization with static linking + #else + _mi_random_init(&_mi_heap_main.random); + #endif + _mi_heap_main.cookie = _mi_heap_random_next(&_mi_heap_main); _mi_heap_main.keys[0] = _mi_heap_random_next(&_mi_heap_main); _mi_heap_main.keys[1] = _mi_heap_random_next(&_mi_heap_main); } @@ -160,54 +185,123 @@ mi_heap_t* _mi_heap_main_get(void) { // note: in x64 in release build `sizeof(mi_thread_data_t)` is under 4KiB (= OS page size). typedef struct mi_thread_data_s { - mi_heap_t heap; // must come first due to cast in `_mi_heap_done` + mi_heap_t heap; // must come first due to cast in `_mi_heap_done` mi_tld_t tld; + mi_memid_t memid; // must come last due to zero'ing } mi_thread_data_t; + +// Thread meta-data is allocated directly from the OS. For +// some programs that do not use thread pools and allocate and +// destroy many OS threads, this may causes too much overhead +// per thread so we maintain a small cache of recently freed metadata. + +#define TD_CACHE_SIZE (16) +static _Atomic(mi_thread_data_t*) td_cache[TD_CACHE_SIZE]; + +static mi_thread_data_t* mi_thread_data_zalloc(void) { + // try to find thread metadata in the cache + bool is_zero = false; + mi_thread_data_t* td = NULL; + for (int i = 0; i < TD_CACHE_SIZE; i++) { + td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]); + if (td != NULL) { + // found cached allocation, try use it + td = mi_atomic_exchange_ptr_acq_rel(mi_thread_data_t, &td_cache[i], NULL); + if (td != NULL) { + break; + } + } + } + + // if that fails, allocate as meta data + if (td == NULL) { + mi_memid_t memid; + td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &memid, &_mi_stats_main); + if (td == NULL) { + // if this fails, try once more. 
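The thread-metadata cache introduced below keeps a handful of freed entries in atomic slots so short-lived threads avoid a fresh OS allocation each time. A minimal standalone sketch of that slot-cache pattern (hypothetical demo_* names, with malloc/free standing in for the OS allocator):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define DEMO_CACHE_SIZE 16

typedef struct demo_td_s { int dummy; } demo_td_t;

static _Atomic(demo_td_t*) demo_cache[DEMO_CACHE_SIZE];  // all slots start NULL

// Allocate: first try to grab a cached entry, otherwise fall back to the
// general allocator.
static demo_td_t* demo_td_alloc(void) {
  for (int i = 0; i < DEMO_CACHE_SIZE; i++) {
    demo_td_t* td = atomic_exchange_explicit(&demo_cache[i], NULL,
                                             memory_order_acq_rel);
    if (td != NULL) return td;
  }
  return (demo_td_t*)calloc(1, sizeof(demo_td_t));
}

// Free: park the entry in the first empty slot; only release it for real
// when every slot is already occupied.
static void demo_td_free(demo_td_t* td) {
  for (int i = 0; i < DEMO_CACHE_SIZE; i++) {
    demo_td_t* expected = NULL;
    if (atomic_compare_exchange_strong_explicit(&demo_cache[i], &expected, td,
                                                memory_order_acq_rel,
                                                memory_order_relaxed)) {
      return;
    }
  }
  free(td);
}

int main(void) {
  demo_td_t* a = demo_td_alloc();
  demo_td_free(a);                 // parked in the cache
  demo_td_t* b = demo_td_alloc();  // reuses the parked entry
  printf("reused cached entry: %d\n", a == b);
  free(b);                         // final cleanup for the demo
  return 0;
}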
(issue #257) + td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &memid, &_mi_stats_main); + if (td == NULL) { + // really out of memory + _mi_error_message(ENOMEM, "unable to allocate thread local heap metadata (%zu bytes)\n", sizeof(mi_thread_data_t)); + } + } + if (td != NULL) { + td->memid = memid; + is_zero = memid.initially_zero; + } + } + + if (td != NULL && !is_zero) { + _mi_memzero_aligned(td, offsetof(mi_thread_data_t,memid)); + } + return td; +} + +static void mi_thread_data_free( mi_thread_data_t* tdfree ) { + // try to add the thread metadata to the cache + for (int i = 0; i < TD_CACHE_SIZE; i++) { + mi_thread_data_t* td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]); + if (td == NULL) { + mi_thread_data_t* expected = NULL; + if (mi_atomic_cas_ptr_weak_acq_rel(mi_thread_data_t, &td_cache[i], &expected, tdfree)) { + return; + } + } + } + // if that fails, just free it directly + _mi_os_free(tdfree, sizeof(mi_thread_data_t), tdfree->memid, &_mi_stats_main); +} + +void _mi_thread_data_collect(void) { + // free all thread metadata from the cache + for (int i = 0; i < TD_CACHE_SIZE; i++) { + mi_thread_data_t* td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]); + if (td != NULL) { + td = mi_atomic_exchange_ptr_acq_rel(mi_thread_data_t, &td_cache[i], NULL); + if (td != NULL) { + _mi_os_free(td, sizeof(mi_thread_data_t), td->memid, &_mi_stats_main); + } + } + } +} + // Initialize the thread local default heap, called from `mi_thread_init` -static bool _mi_heap_init(void) { - if (mi_heap_is_initialized(mi_get_default_heap())) return true; +static bool _mi_thread_heap_init(void) { + if (mi_heap_is_initialized(mi_prim_get_default_heap())) return true; if (_mi_is_main_thread()) { // mi_assert_internal(_mi_heap_main.thread_id != 0); // can happen on freeBSD where alloc is called before any initialization // the main heap is statically allocated mi_heap_main_init(); _mi_heap_set_default_direct(&_mi_heap_main); - //mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_get_default_heap()); + //mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_prim_get_default_heap()); } else { // use `_mi_os_alloc` to allocate directly from the OS - mi_thread_data_t* td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &_mi_stats_main); // Todo: more efficient allocation? - if (td == NULL) { - // if this fails, try once more. 
(issue #257) - td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &_mi_stats_main); - if (td == NULL) { - // really out of memory - _mi_error_message(ENOMEM, "unable to allocate thread local heap metadata (%zu bytes)\n", sizeof(mi_thread_data_t)); - return false; - } - } - // OS allocated so already zero initialized + mi_thread_data_t* td = mi_thread_data_zalloc(); + if (td == NULL) return false; + mi_tld_t* tld = &td->tld; mi_heap_t* heap = &td->heap; - _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(*heap)); - heap->thread_id = _mi_thread_id(); - _mi_random_init(&heap->random); - heap->cookie = _mi_heap_random_next(heap) | 1; - heap->keys[0] = _mi_heap_random_next(heap); - heap->keys[1] = _mi_heap_random_next(heap); - heap->tld = tld; - tld->heap_backing = heap; - tld->heaps = heap; - tld->segments.stats = &tld->stats; - tld->segments.os = &tld->os; - tld->os.stats = &tld->stats; - _mi_heap_set_default_direct(heap); + _mi_tld_init(tld, heap); // must be before `_mi_heap_init` + _mi_heap_init(heap, tld, _mi_arena_id_none(), false /* can reclaim */, 0 /* default tag */); + _mi_heap_set_default_direct(heap); } return false; } +// initialize thread local data +void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap) { + _mi_memzero_aligned(tld,sizeof(mi_tld_t)); + tld->heap_backing = bheap; + tld->heaps = NULL; + tld->segments.stats = &tld->stats; + tld->segments.os = &tld->os; + tld->os.stats = &tld->stats; +} + // Free the thread local default heap (called from `mi_thread_done`) -static bool _mi_heap_done(mi_heap_t* heap) { +static bool _mi_thread_heap_done(mi_heap_t* heap) { if (!mi_heap_is_initialized(heap)) return true; // reset default heap @@ -234,23 +328,23 @@ static bool _mi_heap_done(mi_heap_t* heap) { if (heap != &_mi_heap_main) { _mi_heap_collect_abandon(heap); } - + // merge stats - _mi_stats_done(&heap->tld->stats); + _mi_stats_done(&heap->tld->stats); // free if not the main thread if (heap != &_mi_heap_main) { mi_assert_internal(heap->tld->segments.count == 0 || heap->thread_id != _mi_thread_id()); - _mi_os_free(heap, sizeof(mi_thread_data_t), &_mi_stats_main); + mi_thread_data_free((mi_thread_data_t*)heap); } -#if 0 - // never free the main thread even in debug mode; if a dll is linked statically with mimalloc, - // there may still be delete/free calls after the mi_fls_done is called. Issue #207 else { + #if 0 + // never free the main thread even in debug mode; if a dll is linked statically with mimalloc, + // there may still be delete/free calls after the mi_fls_done is called. Issue #207 _mi_heap_destroy_pages(heap); mi_assert_internal(heap->tld->heap_backing == &_mi_heap_main); + #endif } -#endif return false; } @@ -272,57 +366,12 @@ static bool _mi_heap_done(mi_heap_t* heap) { // to set up the thread local keys. 
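On POSIX, the automatic thread-done hook referred to here is typically built on a TLS key whose destructor runs at thread exit (the platform-specific setup now sits behind `_mi_prim_thread_init_auto_done`, as the removed code below shows). A POSIX-only sketch of that mechanism (hypothetical demo_* names; link with -lpthread):

#include <pthread.h>
#include <stdio.h>

static pthread_key_t demo_key;

// Runs automatically when a thread that stored a non-NULL value exits.
static void demo_thread_done(void* value) {
  printf("thread-done callback, value=%p\n", value);
}

static void demo_auto_done_init(void) {
  pthread_key_create(&demo_key, &demo_thread_done);
}

static void* demo_thread(void* arg) {
  // storing a non-NULL value arms the destructor for this thread
  pthread_setspecific(demo_key, arg);
  return NULL;
}

int main(void) {
  demo_auto_done_init();
  int token = 0;
  pthread_t t;
  pthread_create(&t, NULL, &demo_thread, &token);
  pthread_join(t, NULL);
  return 0;
}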
// -------------------------------------------------------- -static void _mi_thread_done(mi_heap_t* default_heap); - -#ifdef __wasi__ -// no pthreads in the WebAssembly Standard Interface -#elif !defined(_WIN32) -#define MI_USE_PTHREADS -#endif - -#if defined(_WIN32) && defined(MI_SHARED_LIB) - // nothing to do as it is done in DllMain -#elif defined(_WIN32) && !defined(MI_SHARED_LIB) - // use thread local storage keys to detect thread ending - #include - #include - #if (_WIN32_WINNT < 0x600) // before Windows Vista - WINBASEAPI DWORD WINAPI FlsAlloc( _In_opt_ PFLS_CALLBACK_FUNCTION lpCallback ); - WINBASEAPI PVOID WINAPI FlsGetValue( _In_ DWORD dwFlsIndex ); - WINBASEAPI BOOL WINAPI FlsSetValue( _In_ DWORD dwFlsIndex, _In_opt_ PVOID lpFlsData ); - WINBASEAPI BOOL WINAPI FlsFree(_In_ DWORD dwFlsIndex); - #endif - static DWORD mi_fls_key = (DWORD)(-1); - static void NTAPI mi_fls_done(PVOID value) { - if (value!=NULL) _mi_thread_done((mi_heap_t*)value); - } -#elif defined(MI_USE_PTHREADS) - // use pthread local storage keys to detect thread ending - // (and used with MI_TLS_PTHREADS for the default heap) - #include - pthread_key_t _mi_heap_default_key = (pthread_key_t)(-1); - static void mi_pthread_done(void* value) { - if (value!=NULL) _mi_thread_done((mi_heap_t*)value); - } -#elif defined(__wasi__) -// no pthreads in the WebAssembly Standard Interface -#else - #pragma message("define a way to call mi_thread_done when a thread is done") -#endif - // Set up handlers so `mi_thread_done` is called automatically static void mi_process_setup_auto_thread_done(void) { static bool tls_initialized = false; // fine if it races if (tls_initialized) return; tls_initialized = true; - #if defined(_WIN32) && defined(MI_SHARED_LIB) - // nothing to do as it is done in DllMain - #elif defined(_WIN32) && !defined(MI_SHARED_LIB) - mi_fls_key = FlsAlloc(&mi_fls_done); - #elif defined(MI_USE_PTHREADS) - mi_assert_internal(_mi_heap_default_key == (pthread_key_t)(-1)); - pthread_key_create(&_mi_heap_default_key, &mi_pthread_done); - #endif + _mi_prim_thread_init_auto_done(); _mi_heap_set_default_direct(&_mi_heap_main); } @@ -331,41 +380,62 @@ bool _mi_is_main_thread(void) { return (_mi_heap_main.thread_id==0 || _mi_heap_main.thread_id == _mi_thread_id()); } +static _Atomic(size_t) thread_count = MI_ATOMIC_VAR_INIT(1); + +size_t _mi_current_thread_count(void) { + return mi_atomic_load_relaxed(&thread_count); +} + // This is called from the `mi_malloc_generic` void mi_thread_init(void) mi_attr_noexcept { // ensure our process has started already mi_process_init(); - + // initialize the thread local default heap // (this will call `_mi_heap_set_default_direct` and thus set the // fiber/pthread key to a non-zero value, ensuring `_mi_thread_done` is called) - if (_mi_heap_init()) return; // returns true if already initialized + if (_mi_thread_heap_init()) return; // returns true if already initialized _mi_stat_increase(&_mi_stats_main.threads, 1); + mi_atomic_increment_relaxed(&thread_count); //_mi_verbose_message("thread init: 0x%zx\n", _mi_thread_id()); } void mi_thread_done(void) mi_attr_noexcept { - _mi_thread_done(mi_get_default_heap()); + _mi_thread_done(NULL); } -static void _mi_thread_done(mi_heap_t* heap) { +void _mi_thread_done(mi_heap_t* heap) +{ + // calling with NULL implies using the default heap + if (heap == NULL) { + heap = mi_prim_get_default_heap(); + if (heap == NULL) return; + } + + // prevent re-entrancy through heap_done/heap_set_default_direct (issue #699) + if (!mi_heap_is_initialized(heap)) { 
+ return; + } + + // adjust stats + mi_atomic_decrement_relaxed(&thread_count); _mi_stat_decrease(&_mi_stats_main.threads, 1); // check thread-id as on Windows shutdown with FLS the main (exit) thread may call this on thread-local heaps... if (heap->thread_id != _mi_thread_id()) return; - + // abandon the thread local heap - if (_mi_heap_done(heap)) return; // returns true if already ran + if (_mi_thread_heap_done(heap)) return; // returns true if already ran } void _mi_heap_set_default_direct(mi_heap_t* heap) { mi_assert_internal(heap != NULL); #if defined(MI_TLS_SLOT) - mi_tls_slot_set(MI_TLS_SLOT,heap); + mi_prim_tls_slot_set(MI_TLS_SLOT,heap); #elif defined(MI_TLS_PTHREAD_SLOT_OFS) - *mi_tls_pthread_heap_slot() = heap; + *mi_prim_tls_pthread_heap_slot() = heap; #elif defined(MI_TLS_PTHREAD) // we use _mi_heap_default_key #else @@ -374,38 +444,29 @@ void _mi_heap_set_default_direct(mi_heap_t* heap) { // ensure the default heap is passed to `_mi_thread_done` // setting to a non-NULL value also ensures `mi_thread_done` is called. - #if defined(_WIN32) && defined(MI_SHARED_LIB) - // nothing to do as it is done in DllMain - #elif defined(_WIN32) && !defined(MI_SHARED_LIB) - mi_assert_internal(mi_fls_key != 0); - FlsSetValue(mi_fls_key, heap); - #elif defined(MI_USE_PTHREADS) - if (_mi_heap_default_key != (pthread_key_t)(-1)) { // can happen during recursive invocation on freeBSD - pthread_setspecific(_mi_heap_default_key, heap); - } - #endif + _mi_prim_thread_associate_default_heap(heap); } // -------------------------------------------------------- // Run functions on process init/done, and thread init/done // -------------------------------------------------------- -static void mi_process_done(void); +static void mi_cdecl mi_process_done(void); static bool os_preloading = true; // true until this module is initialized static bool mi_redirected = false; // true if malloc redirects to mi_malloc // Returns true if this module has not been initialized; Don't use C runtime routines until it returns false. 
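The guard described in the comment above is used throughout this file: until the module is initialized, internal helpers bail out (or fall back to OS primitives) rather than call into the C runtime. A minimal sketch of that pattern, assuming only the internal `_mi_preloading` and `_mi_fputs` helpers that appear in this patch (the `log_safely` wrapper itself is hypothetical and not part of the diff):

// Hypothetical illustration of the preloading guard; not part of the patch.
static void log_safely(const char* msg) {
  if (_mi_preloading()) return;              // module not initialized yet: avoid the C runtime
  _mi_fputs(NULL, NULL, "mimalloc: ", msg);  // safe once preloading has finished
}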
-bool _mi_preloading(void) { +bool mi_decl_noinline _mi_preloading(void) { return os_preloading; } -bool mi_is_redirected(void) mi_attr_noexcept { +mi_decl_nodiscard bool mi_is_redirected(void) mi_attr_noexcept { return mi_redirected; } // Communicate with the redirection module on Windows -#if defined(_WIN32) && defined(MI_SHARED_LIB) +#if defined(_WIN32) && defined(MI_SHARED_LIB) && !defined(MI_WIN_NOREDIRECT) #ifdef __cplusplus extern "C" { #endif @@ -421,8 +482,8 @@ mi_decl_export void _mi_redirect_entry(DWORD reason) { mi_thread_done(); } } -__declspec(dllimport) bool mi_allocator_init(const char** message); -__declspec(dllimport) void mi_allocator_done(void); +__declspec(dllimport) bool mi_cdecl mi_allocator_init(const char** message); +__declspec(dllimport) void mi_cdecl mi_allocator_done(void); #ifdef __cplusplus } #endif @@ -439,15 +500,18 @@ static void mi_allocator_done(void) { // Called once by the process loader static void mi_process_load(void) { mi_heap_main_init(); - #if defined(MI_TLS_RECURSE_GUARD) + #if defined(__APPLE__) || defined(MI_TLS_RECURSE_GUARD) volatile mi_heap_t* dummy = _mi_heap_default; // access TLS to allocate it before setting tls_initialized to true; - UNUSED(dummy); + if (dummy == NULL) return; // use dummy or otherwise the access may get optimized away (issue #697) #endif os_preloading = false; + mi_assert_internal(_mi_is_main_thread()); + #if !(defined(_WIN32) && defined(MI_SHARED_LIB)) // use Dll process detach (see below) instead of atexit (issue #521) atexit(&mi_process_done); + #endif _mi_options_init(); + mi_process_setup_auto_thread_done(); mi_process_init(); - //mi_stats_reset();- if (mi_redirected) _mi_verbose_message("malloc is redirected.\n"); // show message from the redirector (if present) @@ -456,6 +520,9 @@ static void mi_process_load(void) { if (msg != NULL && (mi_option_is_enabled(mi_option_verbose) || mi_option_is_enabled(mi_option_show_errors))) { _mi_fputs(NULL,NULL,NULL,msg); } + + // reseed random + _mi_random_reinit_if_weak(&_mi_heap_main.random); } #if defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64)) @@ -466,7 +533,7 @@ static void mi_detect_cpu_features(void) { // FSRM for fast rep movsb support (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017)) int32_t cpu_info[4]; __cpuid(cpu_info, 7); - _mi_cpu_has_fsrm = ((cpu_info[3] & (1 << 4)) != 0); // bit 4 of EDX : see + _mi_cpu_has_fsrm = ((cpu_info[3] & (1 << 4)) != 0); // bit 4 of EDX : see } #else static void mi_detect_cpu_features(void) { @@ -477,33 +544,57 @@ static void mi_detect_cpu_features(void) { // Initialize the process; called by thread_init or the process loader void mi_process_init(void) mi_attr_noexcept { // ensure we are called once - if (_mi_process_is_initialized) return; + static mi_atomic_once_t process_init; + #if _MSC_VER < 1920 + mi_heap_main_init(); // vs2017 can dynamically re-initialize _mi_heap_main + #endif + if (!mi_atomic_once(&process_init)) return; _mi_process_is_initialized = true; + _mi_verbose_message("process init: 0x%zx\n", _mi_thread_id()); mi_process_setup_auto_thread_done(); - _mi_verbose_message("process init: 0x%zx\n", _mi_thread_id()); mi_detect_cpu_features(); _mi_os_init(); mi_heap_main_init(); - #if (MI_DEBUG) + #if MI_DEBUG _mi_verbose_message("debug level : %d\n", MI_DEBUG); #endif _mi_verbose_message("secure level: %d\n", MI_SECURE); + _mi_verbose_message("mem tracking: %s\n", MI_TRACK_TOOL); + #if MI_TSAN + _mi_verbose_message("thread santizer enabled\n"); + #endif mi_thread_init(); + + #if defined(_WIN32) + // On windows, when 
building as a static lib the FLS cleanup happens to early for the main thread. + // To avoid this, set the FLS value for the main thread to NULL so the fls cleanup + // will not call _mi_thread_done on the (still executing) main thread. See issue #508. + _mi_prim_thread_associate_default_heap(NULL); + #endif + mi_stats_reset(); // only call stat reset *after* thread init (or the heap tld == NULL) + mi_track_init(); if (mi_option_is_enabled(mi_option_reserve_huge_os_pages)) { - size_t pages = mi_option_get(mi_option_reserve_huge_os_pages); - mi_reserve_huge_os_pages_interleave(pages, 0, pages*500); - } + size_t pages = mi_option_get_clamp(mi_option_reserve_huge_os_pages, 0, 128*1024); + long reserve_at = mi_option_get(mi_option_reserve_huge_os_pages_at); + if (reserve_at != -1) { + mi_reserve_huge_os_pages_at(pages, reserve_at, pages*500); + } else { + mi_reserve_huge_os_pages_interleave(pages, 0, pages*500); + } + } if (mi_option_is_enabled(mi_option_reserve_os_memory)) { long ksize = mi_option_get(mi_option_reserve_os_memory); - if (ksize > 0) mi_reserve_os_memory((size_t)ksize*KiB, true, true); + if (ksize > 0) { + mi_reserve_os_memory((size_t)ksize*MI_KiB, true, true); + } } } // Called when the process is done (through `at_exit`) -static void mi_process_done(void) { +static void mi_cdecl mi_process_done(void) { // only shutdown if we were initialized if (!_mi_process_is_initialized) return; // ensure we are called once @@ -511,22 +602,31 @@ static void mi_process_done(void) { if (process_done) return; process_done = true; - #if defined(_WIN32) && !defined(MI_SHARED_LIB) - FlsSetValue(mi_fls_key, NULL); // don't call main-thread callback - FlsFree(mi_fls_key); // call thread-done on all threads to prevent dangling callback pointer if statically linked with a DLL; Issue #208 - #endif - - #if (MI_DEBUG != 0) || !defined(MI_SHARED_LIB) - // free all memory if possible on process exit. This is not needed for a stand-alone process - // but should be done if mimalloc is statically linked into another shared library which - // is repeatedly loaded/unloaded, see issue #281. - mi_collect(true /* force */ ); + // release any thread specific resources and ensure _mi_thread_done is called on all but the main thread + _mi_prim_thread_done_auto_done(); + + #ifndef MI_SKIP_COLLECT_ON_EXIT + #if (MI_DEBUG || !defined(MI_SHARED_LIB)) + // free all memory if possible on process exit. This is not needed for a stand-alone process + // but should be done if mimalloc is statically linked into another shared library which + // is repeatedly loaded/unloaded, see issue #281. + mi_collect(true /* force */ ); + #endif #endif + // Forcefully release all retained memory; this can be dangerous in general if overriding regular malloc/free + // since after process_done there might still be other code running that calls `free` (like at_exit routines, + // or C-runtime termination code. + if (mi_option_is_enabled(mi_option_destroy_on_exit)) { + mi_collect(true /* force */); + _mi_heap_unsafe_destroy_all(); // forcefully release all memory held by all heaps (of this thread only!) 
+ _mi_arena_unsafe_destroy_all(& _mi_heap_main_get()->tld->stats); + } + if (mi_option_is_enabled(mi_option_show_stats) || mi_option_is_enabled(mi_option_verbose)) { mi_stats_print(NULL); } - mi_allocator_done(); + mi_allocator_done(); _mi_verbose_message("process done: 0x%zx\n", _mi_heap_main.thread_id); os_preloading = true; // don't call the C runtime anymore } @@ -536,31 +636,22 @@ static void mi_process_done(void) { #if defined(_WIN32) && defined(MI_SHARED_LIB) // Windows DLL: easy to hook into process_init and thread_done __declspec(dllexport) BOOL WINAPI DllMain(HINSTANCE inst, DWORD reason, LPVOID reserved) { - UNUSED(reserved); - UNUSED(inst); + MI_UNUSED(reserved); + MI_UNUSED(inst); if (reason==DLL_PROCESS_ATTACH) { mi_process_load(); } + else if (reason==DLL_PROCESS_DETACH) { + mi_process_done(); + } else if (reason==DLL_THREAD_DETACH) { - if (!mi_is_redirected()) mi_thread_done(); + if (!mi_is_redirected()) { + mi_thread_done(); + } } return TRUE; } -#elif defined(__cplusplus) - // C++: use static initialization to detect process start - static bool _mi_process_init(void) { - mi_process_load(); - return (_mi_heap_main.thread_id != 0); - } - static bool mi_initialized = _mi_process_init(); - -#elif defined(__GNUC__) || defined(__clang__) - // GCC,Clang: use the constructor attribute - static void __attribute__((constructor)) _mi_process_init(void) { - mi_process_load(); - } - #elif defined(_MSC_VER) // MSVC: use data section magic for static libraries // See @@ -568,17 +659,31 @@ static void mi_process_done(void) { mi_process_load(); return 0; } - typedef int(*_crt_cb)(void); - #ifdef _M_X64 + typedef int(*_mi_crt_callback_t)(void); + #if defined(_M_X64) || defined(_M_ARM64) __pragma(comment(linker, "/include:" "_mi_msvc_initu")) #pragma section(".CRT$XIU", long, read) #else __pragma(comment(linker, "/include:" "__mi_msvc_initu")) #endif #pragma data_seg(".CRT$XIU") - _crt_cb _mi_msvc_initu[] = { &_mi_process_init }; + mi_decl_externc _mi_crt_callback_t _mi_msvc_initu[] = { &_mi_process_init }; #pragma data_seg() +#elif defined(__cplusplus) + // C++: use static initialization to detect process start + static bool _mi_process_init(void) { + mi_process_load(); + return (_mi_heap_main.thread_id != 0); + } + static bool mi_initialized = _mi_process_init(); + +#elif defined(__GNUC__) || defined(__clang__) + // GCC,Clang: use the constructor attribute + static void __attribute__((constructor)) _mi_process_init(void) { + mi_process_load(); + } + #else #pragma message("define a way to call mi_process_load on your platform") #endif diff --git a/contrib/libs/mimalloc/src/libc.c b/contrib/libs/mimalloc/src/libc.c new file mode 100644 index 000000000000..dd6b40073790 --- /dev/null +++ b/contrib/libs/mimalloc/src/libc.c @@ -0,0 +1,273 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018-2023, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. 
+-----------------------------------------------------------------------------*/ + +// -------------------------------------------------------- +// This module defines various std libc functions to reduce +// the dependency on libc, and also prevent errors caused +// by some libc implementations when called before `main` +// executes (due to malloc redirection) +// -------------------------------------------------------- + +#include "mimalloc.h" +#include "mimalloc/internal.h" +#include "mimalloc/prim.h" // mi_prim_getenv + +char _mi_toupper(char c) { + if (c >= 'a' && c <= 'z') return (c - 'a' + 'A'); + else return c; +} + +int _mi_strnicmp(const char* s, const char* t, size_t n) { + if (n == 0) return 0; + for (; *s != 0 && *t != 0 && n > 0; s++, t++, n--) { + if (_mi_toupper(*s) != _mi_toupper(*t)) break; + } + return (n == 0 ? 0 : *s - *t); +} + +void _mi_strlcpy(char* dest, const char* src, size_t dest_size) { + if (dest==NULL || src==NULL || dest_size == 0) return; + // copy until end of src, or when dest is (almost) full + while (*src != 0 && dest_size > 1) { + *dest++ = *src++; + dest_size--; + } + // always zero terminate + *dest = 0; +} + +void _mi_strlcat(char* dest, const char* src, size_t dest_size) { + if (dest==NULL || src==NULL || dest_size == 0) return; + // find end of string in the dest buffer + while (*dest != 0 && dest_size > 1) { + dest++; + dest_size--; + } + // and catenate + _mi_strlcpy(dest, src, dest_size); +} + +size_t _mi_strlen(const char* s) { + if (s==NULL) return 0; + size_t len = 0; + while(s[len] != 0) { len++; } + return len; +} + +size_t _mi_strnlen(const char* s, size_t max_len) { + if (s==NULL) return 0; + size_t len = 0; + while(s[len] != 0 && len < max_len) { len++; } + return len; +} + +#ifdef MI_NO_GETENV +bool _mi_getenv(const char* name, char* result, size_t result_size) { + MI_UNUSED(name); + MI_UNUSED(result); + MI_UNUSED(result_size); + return false; +} +#else +bool _mi_getenv(const char* name, char* result, size_t result_size) { + if (name==NULL || result == NULL || result_size < 64) return false; + return _mi_prim_getenv(name,result,result_size); +} +#endif + +// -------------------------------------------------------- +// Define our own limited `_mi_vsnprintf` and `_mi_snprintf` +// This is mostly to avoid calling these when libc is not yet +// initialized (and to reduce dependencies) +// +// format: d i, p x u, s +// prec: z l ll L +// width: 10 +// align-left: - +// fill: 0 +// plus: + +// -------------------------------------------------------- + +static void mi_outc(char c, char** out, char* end) { + char* p = *out; + if (p >= end) return; + *p = c; + *out = p + 1; +} + +static void mi_outs(const char* s, char** out, char* end) { + if (s == NULL) return; + char* p = *out; + while (*s != 0 && p < end) { + *p++ = *s++; + } + *out = p; +} + +static void mi_out_fill(char fill, size_t len, char** out, char* end) { + char* p = *out; + for (size_t i = 0; i < len && p < end; i++) { + *p++ = fill; + } + *out = p; +} + +static void mi_out_alignright(char fill, char* start, size_t len, size_t extra, char* end) { + if (len == 0 || extra == 0) return; + if (start + len + extra >= end) return; + // move `len` characters to the right (in reverse since it can overlap) + for (size_t i = 1; i <= len; i++) { + start[len + extra - i] = start[len - i]; + } + // and fill the start + for (size_t i = 0; i < extra; i++) { + start[i] = fill; + } +} + + +static void mi_out_num(uintptr_t x, size_t base, char prefix, char** out, char* end) +{ + if (x == 0 || base 
== 0 || base > 16) { + if (prefix != 0) { mi_outc(prefix, out, end); } + mi_outc('0',out,end); + } + else { + // output digits in reverse + char* start = *out; + while (x > 0) { + char digit = (char)(x % base); + mi_outc((digit <= 9 ? '0' + digit : 'A' + digit - 10),out,end); + x = x / base; + } + if (prefix != 0) { + mi_outc(prefix, out, end); + } + size_t len = *out - start; + // and reverse in-place + for (size_t i = 0; i < (len / 2); i++) { + char c = start[len - i - 1]; + start[len - i - 1] = start[i]; + start[i] = c; + } + } +} + + +#define MI_NEXTC() c = *in; if (c==0) break; in++; + +void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) { + if (buf == NULL || bufsize == 0 || fmt == NULL) return; + buf[bufsize - 1] = 0; + char* const end = buf + (bufsize - 1); + const char* in = fmt; + char* out = buf; + while (true) { + if (out >= end) break; + char c; + MI_NEXTC(); + if (c != '%') { + if ((c >= ' ' && c <= '~') || c=='\n' || c=='\r' || c=='\t') { // output visible ascii or standard control only + mi_outc(c, &out, end); + } + } + else { + MI_NEXTC(); + char fill = ' '; + size_t width = 0; + char numtype = 'd'; + char numplus = 0; + bool alignright = true; + if (c == '+' || c == ' ') { numplus = c; MI_NEXTC(); } + if (c == '-') { alignright = false; MI_NEXTC(); } + if (c == '0') { fill = '0'; MI_NEXTC(); } + if (c >= '1' && c <= '9') { + width = (c - '0'); MI_NEXTC(); + while (c >= '0' && c <= '9') { + width = (10 * width) + (c - '0'); MI_NEXTC(); + } + if (c == 0) break; // extra check due to while + } + if (c == 'z' || c == 't' || c == 'L') { numtype = c; MI_NEXTC(); } + else if (c == 'l') { + numtype = c; MI_NEXTC(); + if (c == 'l') { numtype = 'L'; MI_NEXTC(); } + } + + char* start = out; + if (c == 's') { + // string + const char* s = va_arg(args, const char*); + mi_outs(s, &out, end); + } + else if (c == 'p' || c == 'x' || c == 'u') { + // unsigned + uintptr_t x = 0; + if (c == 'x' || c == 'u') { + if (numtype == 'z') x = va_arg(args, size_t); + else if (numtype == 't') x = va_arg(args, uintptr_t); // unsigned ptrdiff_t + else if (numtype == 'L') x = (uintptr_t)va_arg(args, unsigned long long); + else x = va_arg(args, unsigned long); + } + else if (c == 'p') { + x = va_arg(args, uintptr_t); + mi_outs("0x", &out, end); + start = out; + width = (width >= 2 ? width - 2 : 0); + } + if (width == 0 && (c == 'x' || c == 'p')) { + if (c == 'p') { width = 2 * (x <= UINT32_MAX ? 4 : ((x >> 16) <= UINT32_MAX ? 6 : sizeof(void*))); } + if (width == 0) { width = 2; } + fill = '0'; + } + mi_out_num(x, (c == 'x' || c == 'p' ? 
16 : 10), numplus, &out, end); + } + else if (c == 'i' || c == 'd') { + // signed + intptr_t x = 0; + if (numtype == 'z') x = va_arg(args, intptr_t ); + else if (numtype == 't') x = va_arg(args, ptrdiff_t); + else if (numtype == 'L') x = (intptr_t)va_arg(args, long long); + else x = va_arg(args, long); + char pre = 0; + if (x < 0) { + pre = '-'; + if (x > INTPTR_MIN) { x = -x; } + } + else if (numplus != 0) { + pre = numplus; + } + mi_out_num((uintptr_t)x, 10, pre, &out, end); + } + else if (c >= ' ' && c <= '~') { + // unknown format + mi_outc('%', &out, end); + mi_outc(c, &out, end); + } + + // fill & align + mi_assert_internal(out <= end); + mi_assert_internal(out >= start); + const size_t len = out - start; + if (len < width) { + mi_out_fill(fill, width - len, &out, end); + if (alignright && out <= end) { + mi_out_alignright(fill, start, len, width - len, end); + } + } + } + } + mi_assert_internal(out <= end); + *out = 0; +} + +void _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...) { + va_list args; + va_start(args, fmt); + _mi_vsnprintf(buf, buflen, fmt, args); + va_end(args); +} diff --git a/contrib/libs/mimalloc/src/options.c b/contrib/libs/mimalloc/src/options.c index 30025db226fe..db6e040fe8df 100644 --- a/contrib/libs/mimalloc/src/options.c +++ b/contrib/libs/mimalloc/src/options.c @@ -5,32 +5,24 @@ terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #include "mimalloc.h" -#include "mimalloc-internal.h" -#include "mimalloc-atomic.h" +#include "mimalloc/internal.h" +#include "mimalloc/atomic.h" +#include "mimalloc/prim.h" // mi_prim_out_stderr -#include -#include // strtol -#include // strncpy, strncat, strlen, strstr -#include // toupper -#include +#include // stdin/stdout +#include // abort -#ifdef _MSC_VER -#pragma warning(disable:4996) // strncpy, strncat -#endif -static uintptr_t mi_max_error_count = 16; // stop outputting errors after this -static uintptr_t mi_max_warning_count = 16; // stop outputting warnings after this +static long mi_max_error_count = 16; // stop outputting errors after this (use < 0 for no limit) +static long mi_max_warning_count = 16; // stop outputting warnings after this (use < 0 for no limit) -static void mi_add_stderr_output(); +static void mi_add_stderr_output(void); int mi_version(void) mi_attr_noexcept { return MI_MALLOC_VERSION; } -#ifdef _WIN32 -#include -#endif // -------------------------------------------------------- // Options @@ -49,10 +41,11 @@ typedef struct mi_option_desc_s { mi_init_t init; // is it initialized yet? (from the environment) mi_option_t option; // for debugging: the option index should match the option const char* name; // option name without `mimalloc_` prefix + const char* legacy_name; // potential legacy option name } mi_option_desc_t; -#define MI_OPTION(opt) mi_option_##opt, #opt -#define MI_OPTION_DESC(opt) {0, UNINIT, MI_OPTION(opt) } +#define MI_OPTION(opt) mi_option_##opt, #opt, NULL +#define MI_OPTION_LEGACY(opt,legacy) mi_option_##opt, #opt, #legacy static mi_option_desc_t options[_mi_option_last] = { @@ -66,65 +59,95 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(verbose) }, // the following options are experimental and not all combinations make sense. 
- { 1, UNINIT, MI_OPTION(eager_commit) }, // commit per segment directly (4MiB) (but see also `eager_commit_delay`) - #if defined(_WIN32) || (MI_INTPTR_SIZE <= 4) // and other OS's without overcommit? - { 0, UNINIT, MI_OPTION(eager_region_commit) }, - { 1, UNINIT, MI_OPTION(reset_decommits) }, // reset decommits memory - #else - { 1, UNINIT, MI_OPTION(eager_region_commit) }, - { 0, UNINIT, MI_OPTION(reset_decommits) }, // reset uses MADV_FREE/MADV_DONTNEED - #endif - { 0, UNINIT, MI_OPTION(large_os_pages) }, // use large OS pages, use only with eager commit to prevent fragmentation of VMA's - { 0, UNINIT, MI_OPTION(reserve_huge_os_pages) }, // per 1GiB huge pages - { 0, UNINIT, MI_OPTION(reserve_os_memory) }, - { 0, UNINIT, MI_OPTION(segment_cache) }, // cache N segments per thread - { 1, UNINIT, MI_OPTION(page_reset) }, // reset page memory on free - { 0, UNINIT, MI_OPTION(abandoned_page_reset) },// reset free page memory when a thread terminates - { 0, UNINIT, MI_OPTION(segment_reset) }, // reset segment memory on free (needs eager commit) + { 1, UNINIT, MI_OPTION(eager_commit) }, // commit per segment directly (4MiB) (but see also `eager_commit_delay`) + { 2, UNINIT, MI_OPTION_LEGACY(arena_eager_commit,eager_region_commit) }, // eager commit arena's? 2 is used to enable this only on an OS that has overcommit (i.e. linux) + { 1, UNINIT, MI_OPTION_LEGACY(purge_decommits,reset_decommits) }, // purge decommits memory (instead of reset) (note: on linux this uses MADV_DONTNEED for decommit) + { 0, UNINIT, MI_OPTION_LEGACY(allow_large_os_pages,large_os_pages) }, // use large OS pages, use only with eager commit to prevent fragmentation of VMA's + { 0, UNINIT, MI_OPTION(reserve_huge_os_pages) }, // per 1GiB huge pages + {-1, UNINIT, MI_OPTION(reserve_huge_os_pages_at) }, // reserve huge pages at node N + { 0, UNINIT, MI_OPTION(reserve_os_memory) }, // reserve N KiB OS memory in advance (use `option_get_size`) + { 0, UNINIT, MI_OPTION(deprecated_segment_cache) }, // cache N segments per thread + { 0, UNINIT, MI_OPTION(deprecated_page_reset) }, // reset page memory on free + { 0, UNINIT, MI_OPTION(abandoned_page_purge) }, // purge free page memory when a thread terminates + { 0, UNINIT, MI_OPTION(deprecated_segment_reset) }, // reset segment memory on free (needs eager commit) #if defined(__NetBSD__) - { 0, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed + { 0, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed #else - { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) + { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) #endif - { 100, UNINIT, MI_OPTION(reset_delay) }, // reset delay in milli-seconds - { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. 
- { 0, UNINIT, MI_OPTION(limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas) - { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose - { 16, UNINIT, MI_OPTION(max_errors) }, // maximum errors that are output - { 16, UNINIT, MI_OPTION(max_warnings) } // maximum warnings that are output + { 10, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds + { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. + { 0, UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas) + { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose + { 32, UNINIT, MI_OPTION(max_errors) }, // maximum errors that are output + { 32, UNINIT, MI_OPTION(max_warnings) }, // maximum warnings that are output + { 10, UNINIT, MI_OPTION(max_segment_reclaim)}, // max. percentage of the abandoned segments to be reclaimed per try. + { 0, UNINIT, MI_OPTION(destroy_on_exit)}, // release all OS memory on process exit; careful with dangling pointer or after-exit frees! + #if (MI_INTPTR_SIZE>4) + { 1024L*1024L, UNINIT, MI_OPTION(arena_reserve) }, // reserve memory N KiB at a time (=1GiB) (use `option_get_size`) + #else + { 128L*1024L, UNINIT, MI_OPTION(arena_reserve) }, // =128MiB on 32-bit + #endif + { 10, UNINIT, MI_OPTION(arena_purge_mult) }, // purge delay multiplier for arena's + { 1, UNINIT, MI_OPTION_LEGACY(purge_extend_delay, decommit_extend_delay) }, + { 1, UNINIT, MI_OPTION(abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free + { 0, UNINIT, MI_OPTION(disallow_arena_alloc) }, // 1 = do not use arena's for allocation (except if using specific arena id's) + { 400, UNINIT, MI_OPTION(retry_on_oom) }, // windows only: retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries. }; static void mi_option_init(mi_option_desc_t* desc); +static bool mi_option_has_size_in_kib(mi_option_t option) { + return (option == mi_option_reserve_os_memory || option == mi_option_arena_reserve); +} + void _mi_options_init(void) { // called on process load; should not be called before the CRT is initialized! // (e.g. do not call this from process_init as that may run before CRT initialization) mi_add_stderr_output(); // now it safe to use stderr for output for(int i = 0; i < _mi_option_last; i++ ) { mi_option_t option = (mi_option_t)i; - long l = mi_option_get(option); UNUSED(l); // initialize - if (option != mi_option_verbose) { + long l = mi_option_get(option); MI_UNUSED(l); // initialize + // if (option != mi_option_verbose) + { mi_option_desc_t* desc = &options[option]; - _mi_verbose_message("option '%s': %ld\n", desc->name, desc->value); + _mi_verbose_message("option '%s': %ld %s\n", desc->name, desc->value, (mi_option_has_size_in_kib(option) ? 
"KiB" : "")); } } mi_max_error_count = mi_option_get(mi_option_max_errors); mi_max_warning_count = mi_option_get(mi_option_max_warnings); } -long mi_option_get(mi_option_t option) { +mi_decl_nodiscard long mi_option_get(mi_option_t option) { mi_assert(option >= 0 && option < _mi_option_last); + if (option < 0 || option >= _mi_option_last) return 0; mi_option_desc_t* desc = &options[option]; mi_assert(desc->option == option); // index should match the option - if (mi_unlikely(desc->init == UNINIT)) { + if mi_unlikely(desc->init == UNINIT) { mi_option_init(desc); } return desc->value; } +mi_decl_nodiscard long mi_option_get_clamp(mi_option_t option, long min, long max) { + long x = mi_option_get(option); + return (x < min ? min : (x > max ? max : x)); +} + +mi_decl_nodiscard size_t mi_option_get_size(mi_option_t option) { + mi_assert_internal(mi_option_has_size_in_kib(option)); + const long x = mi_option_get(option); + size_t size = (x < 0 ? 0 : (size_t)x); + if (mi_option_has_size_in_kib(option)) { + size *= MI_KiB; + } + return size; +} + void mi_option_set(mi_option_t option, long value) { mi_assert(option >= 0 && option < _mi_option_last); + if (option < 0 || option >= _mi_option_last) return; mi_option_desc_t* desc = &options[option]; mi_assert(desc->option == option); // index should match the option desc->value = value; @@ -133,13 +156,14 @@ void mi_option_set(mi_option_t option, long value) { void mi_option_set_default(mi_option_t option, long value) { mi_assert(option >= 0 && option < _mi_option_last); + if (option < 0 || option >= _mi_option_last) return; mi_option_desc_t* desc = &options[option]; if (desc->init != INITIALIZED) { desc->value = value; } } -bool mi_option_is_enabled(mi_option_t option) { +mi_decl_nodiscard bool mi_option_is_enabled(mi_option_t option) { return (mi_option_get(option) != 0); } @@ -159,16 +183,11 @@ void mi_option_disable(mi_option_t option) { mi_option_set_enabled(option,false); } - -static void mi_out_stderr(const char* msg, void* arg) { - UNUSED(arg); - #ifdef _WIN32 - // on windows with redirection, the C runtime cannot handle locale dependent output - // after the main thread closes so we use direct console output. - if (!_mi_preloading()) { _cputs(msg); } - #else - fputs(msg, stderr); - #endif +static void mi_cdecl mi_out_stderr(const char* msg, void* arg) { + MI_UNUSED(arg); + if (msg != NULL && msg[0] != 0) { + _mi_prim_out_stderr(msg); + } } // Since an output function can be registered earliest in the `main` @@ -176,19 +195,19 @@ static void mi_out_stderr(const char* msg, void* arg) { // an output function is registered it is called immediately with // the output up to that point. 
#ifndef MI_MAX_DELAY_OUTPUT -#define MI_MAX_DELAY_OUTPUT ((uintptr_t)(32*1024)) +#define MI_MAX_DELAY_OUTPUT ((size_t)(32*1024)) #endif static char out_buf[MI_MAX_DELAY_OUTPUT+1]; -static _Atomic(uintptr_t) out_len; +static _Atomic(size_t) out_len; -static void mi_out_buf(const char* msg, void* arg) { - UNUSED(arg); +static void mi_cdecl mi_out_buf(const char* msg, void* arg) { + MI_UNUSED(arg); if (msg==NULL) return; if (mi_atomic_load_relaxed(&out_len)>=MI_MAX_DELAY_OUTPUT) return; - size_t n = strlen(msg); + size_t n = _mi_strlen(msg); if (n==0) return; // claim space - uintptr_t start = mi_atomic_add_acq_rel(&out_len, n); + size_t start = mi_atomic_add_acq_rel(&out_len, n); if (start >= MI_MAX_DELAY_OUTPUT) return; // check bound if (start+n >= MI_MAX_DELAY_OUTPUT) { @@ -213,7 +232,7 @@ static void mi_out_buf_flush(mi_output_fun* out, bool no_more_buf, void* arg) { // Once this module is loaded, switch to this routine // which outputs to stderr and the delayed output buffer. -static void mi_out_buf_stderr(const char* msg, void* arg) { +static void mi_cdecl mi_out_buf_stderr(const char* msg, void* arg) { mi_out_stderr(msg,arg); mi_out_buf(msg,arg); } @@ -242,7 +261,7 @@ void mi_register_output(mi_output_fun* out, void* arg) mi_attr_noexcept { } // add stderr to the delayed output after the module is loaded -static void mi_add_stderr_output() { +static void mi_add_stderr_output(void) { mi_assert_internal(mi_out_default == NULL); mi_out_buf_flush(&mi_out_stderr, false, NULL); // flush current contents to stderr mi_out_default = &mi_out_buf_stderr; // and add stderr to the delayed output @@ -251,31 +270,46 @@ static void mi_add_stderr_output() { // -------------------------------------------------------- // Messages, all end up calling `_mi_fputs`. // -------------------------------------------------------- -static _Atomic(uintptr_t) error_count; // = 0; // when >= max_error_count stop emitting errors -static _Atomic(uintptr_t) warning_count; // = 0; // when >= max_warning_count stop emitting warnings +static _Atomic(size_t) error_count; // = 0; // when >= max_error_count stop emitting errors +static _Atomic(size_t) warning_count; // = 0; // when >= max_warning_count stop emitting warnings // When overriding malloc, we may recurse into mi_vfprintf if an allocation // inside the C runtime causes another message. +// In some cases (like on macOS) the loader already allocates which +// calls into mimalloc; if we then access thread locals (like `recurse`) +// this may crash as the access may call _tlv_bootstrap that tries to +// (recursively) invoke malloc again to allocate space for the thread local +// variables on demand. This is why we use a _mi_preloading test on such +// platforms. However, C code generator may move the initial thread local address +// load before the `if` and we therefore split it out in a separate funcion. 
static mi_decl_thread bool recurse = false; -static bool mi_recurse_enter(void) { - #if defined(__APPLE__) || defined(MI_TLS_RECURSE_GUARD) - if (_mi_preloading()) return true; - #endif +static mi_decl_noinline bool mi_recurse_enter_prim(void) { if (recurse) return false; recurse = true; return true; } +static mi_decl_noinline void mi_recurse_exit_prim(void) { + recurse = false; +} + +static bool mi_recurse_enter(void) { + #if defined(__APPLE__) || defined(MI_TLS_RECURSE_GUARD) + if (_mi_preloading()) return false; + #endif + return mi_recurse_enter_prim(); +} + static void mi_recurse_exit(void) { #if defined(__APPLE__) || defined(MI_TLS_RECURSE_GUARD) if (_mi_preloading()) return; #endif - recurse = false; + mi_recurse_exit_prim(); } void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message) { - if (out==NULL || (FILE*)out==stdout || (FILE*)out==stderr) { // TODO: use mi_out_stderr for stderr? + if (out==NULL || (void*)out==(void*)stdout || (void*)out==(void*)stderr) { // TODO: use mi_out_stderr for stderr? if (!mi_recurse_enter()) return; out = mi_out_get_default(&arg); if (prefix != NULL) out(prefix, arg); @@ -289,12 +323,12 @@ void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* me } // Define our own limited `fprintf` that avoids memory allocation. -// We do this using `snprintf` with a limited buffer. +// We do this using `_mi_vsnprintf` with a limited buffer. static void mi_vfprintf( mi_output_fun* out, void* arg, const char* prefix, const char* fmt, va_list args ) { char buf[512]; if (fmt==NULL) return; if (!mi_recurse_enter()) return; - vsnprintf(buf,sizeof(buf)-1,fmt,args); + _mi_vsnprintf(buf, sizeof(buf)-1, fmt, args); mi_recurse_exit(); _mi_fputs(out,arg,prefix,buf); } @@ -306,11 +340,22 @@ void _mi_fprintf( mi_output_fun* out, void* arg, const char* fmt, ... ) { va_end(args); } +static void mi_vfprintf_thread(mi_output_fun* out, void* arg, const char* prefix, const char* fmt, va_list args) { + if (prefix != NULL && _mi_strnlen(prefix,33) <= 32 && !_mi_is_main_thread()) { + char tprefix[64]; + _mi_snprintf(tprefix, sizeof(tprefix), "%sthread 0x%tx: ", prefix, (uintptr_t)_mi_thread_id()); + mi_vfprintf(out, arg, tprefix, fmt, args); + } + else { + mi_vfprintf(out, arg, prefix, fmt, args); + } +} + void _mi_trace_message(const char* fmt, ...) { if (mi_option_get(mi_option_verbose) <= 1) return; // only with verbose level 2 or higher va_list args; va_start(args, fmt); - mi_vfprintf(NULL, NULL, "mimalloc: ", fmt, args); + mi_vfprintf_thread(NULL, NULL, "mimalloc: ", fmt, args); va_end(args); } @@ -323,17 +368,21 @@ void _mi_verbose_message(const char* fmt, ...) { } static void mi_show_error_message(const char* fmt, va_list args) { - if (!mi_option_is_enabled(mi_option_show_errors) && !mi_option_is_enabled(mi_option_verbose)) return; - if (mi_atomic_increment_acq_rel(&error_count) > mi_max_error_count) return; - mi_vfprintf(NULL, NULL, "mimalloc: error: ", fmt, args); + if (!mi_option_is_enabled(mi_option_verbose)) { + if (!mi_option_is_enabled(mi_option_show_errors)) return; + if (mi_max_error_count >= 0 && (long)mi_atomic_increment_acq_rel(&error_count) > mi_max_error_count) return; + } + mi_vfprintf_thread(NULL, NULL, "mimalloc: error: ", fmt, args); } void _mi_warning_message(const char* fmt, ...) 
{ - if (!mi_option_is_enabled(mi_option_show_errors) && !mi_option_is_enabled(mi_option_verbose)) return; - if (mi_atomic_increment_acq_rel(&warning_count) > mi_max_warning_count) return; + if (!mi_option_is_enabled(mi_option_verbose)) { + if (!mi_option_is_enabled(mi_option_show_errors)) return; + if (mi_max_warning_count >= 0 && (long)mi_atomic_increment_acq_rel(&warning_count) > mi_max_warning_count) return; + } va_list args; va_start(args,fmt); - mi_vfprintf(NULL, NULL, "mimalloc: warning: ", fmt, args); + mi_vfprintf_thread(NULL, NULL, "mimalloc: warning: ", fmt, args); va_end(args); } @@ -353,8 +402,8 @@ static mi_error_fun* volatile mi_error_handler; // = NULL static _Atomic(void*) mi_error_arg; // = NULL static void mi_error_default(int err) { - UNUSED(err); -#if (MI_DEBUG>0) + MI_UNUSED(err); +#if (MI_DEBUG>0) if (err==EFAULT) { #ifdef _MSC_VER __debugbreak(); @@ -398,108 +447,34 @@ void _mi_error_message(int err, const char* fmt, ...) { // Initialize options by checking the environment // -------------------------------------------------------- -static void mi_strlcpy(char* dest, const char* src, size_t dest_size) { - dest[0] = 0; - strncpy(dest, src, dest_size - 1); - dest[dest_size - 1] = 0; -} +// TODO: implement ourselves to reduce dependencies on the C runtime +#include // strtol +#include // strstr -static void mi_strlcat(char* dest, const char* src, size_t dest_size) { - strncat(dest, src, dest_size - 1); - dest[dest_size - 1] = 0; -} -static inline int mi_strnicmp(const char* s, const char* t, size_t n) { - if (n==0) return 0; - for (; *s != 0 && *t != 0 && n > 0; s++, t++, n--) { - if (toupper(*s) != toupper(*t)) break; - } - return (n==0 ? 0 : *s - *t); -} - -#if defined _WIN32 -// On Windows use GetEnvironmentVariable instead of getenv to work -// reliably even when this is invoked before the C runtime is initialized. -// i.e. when `_mi_preloading() == true`. -// Note: on windows, environment names are not case sensitive. -#include -static bool mi_getenv(const char* name, char* result, size_t result_size) { - result[0] = 0; - size_t len = GetEnvironmentVariableA(name, result, (DWORD)result_size); - return (len > 0 && len < result_size); -} -#elif !defined(MI_USE_ENVIRON) || (MI_USE_ENVIRON!=0) -// On Posix systemsr use `environ` to acces environment variables -// even before the C runtime is initialized. -#if defined(__APPLE__) && defined(__has_include) && __has_include() -#include -static char** mi_get_environ(void) { - return (*_NSGetEnviron()); -} -#else -extern char** environ; -static char** mi_get_environ(void) { - return environ; -} -#endif -static bool mi_getenv(const char* name, char* result, size_t result_size) { - if (name==NULL) return false; - const size_t len = strlen(name); - if (len == 0) return false; - char** env = mi_get_environ(); - if (env == NULL) return false; - // compare up to 256 entries - for (int i = 0; i < 256 && env[i] != NULL; i++) { - const char* s = env[i]; - if (mi_strnicmp(name, s, len) == 0 && s[len] == '=') { // case insensitive - // found it - mi_strlcpy(result, s + len + 1, result_size); - return true; - } - } - return false; -} -#else -// fallback: use standard C `getenv` but this cannot be used while initializing the C runtime -static bool mi_getenv(const char* name, char* result, size_t result_size) { - // cannot call getenv() when still initializing the C runtime. - if (_mi_preloading()) return false; - const char* s = getenv(name); - if (s == NULL) { - // we check the upper case name too. 
- char buf[64+1]; - size_t len = strlen(name); - if (len >= sizeof(buf)) len = sizeof(buf) - 1; - for (size_t i = 0; i < len; i++) { - buf[i] = toupper(name[i]); +static void mi_option_init(mi_option_desc_t* desc) { + // Read option value from the environment + char s[64 + 1]; + char buf[64+1]; + _mi_strlcpy(buf, "mimalloc_", sizeof(buf)); + _mi_strlcat(buf, desc->name, sizeof(buf)); + bool found = _mi_getenv(buf, s, sizeof(s)); + if (!found && desc->legacy_name != NULL) { + _mi_strlcpy(buf, "mimalloc_", sizeof(buf)); + _mi_strlcat(buf, desc->legacy_name, sizeof(buf)); + found = _mi_getenv(buf, s, sizeof(s)); + if (found) { + _mi_warning_message("environment option \"mimalloc_%s\" is deprecated -- use \"mimalloc_%s\" instead.\n", desc->legacy_name, desc->name); } - buf[len] = 0; - s = getenv(buf); } - if (s != NULL && strlen(s) < result_size) { - mi_strlcpy(result, s, result_size); - return true; - } - else { - return false; - } -} -#endif -static void mi_option_init(mi_option_desc_t* desc) { - // Read option value from the environment - char buf[64+1]; - mi_strlcpy(buf, "mimalloc_", sizeof(buf)); - mi_strlcat(buf, desc->name, sizeof(buf)); - char s[64+1]; - if (mi_getenv(buf, s, sizeof(s))) { - size_t len = strlen(s); - if (len >= sizeof(buf)) len = sizeof(buf) - 1; + if (found) { + size_t len = _mi_strnlen(s, sizeof(buf) - 1); for (size_t i = 0; i < len; i++) { - buf[i] = (char)toupper(s[i]); + buf[i] = _mi_toupper(s[i]); } buf[len] = 0; - if (buf[0]==0 || strstr("1;TRUE;YES;ON", buf) != NULL) { + if (buf[0] == 0 || strstr("1;TRUE;YES;ON", buf) != NULL) { desc->value = 1; desc->init = INITIALIZED; } @@ -510,21 +485,38 @@ static void mi_option_init(mi_option_desc_t* desc) { else { char* end = buf; long value = strtol(buf, &end, 10); - if (desc->option == mi_option_reserve_os_memory) { - // this option is interpreted in KiB to prevent overflow of `long` + if (mi_option_has_size_in_kib(desc->option)) { + // this option is interpreted in KiB to prevent overflow of `long` for large allocations + // (long is 32-bit on 64-bit windows, which allows for 4TiB max.) + size_t size = (value < 0 ? 0 : (size_t)value); + bool overflow = false; if (*end == 'K') { end++; } - else if (*end == 'M') { value *= KiB; end++; } - else if (*end == 'G') { value *= MiB; end++; } - else { value = (value + KiB - 1) / KiB; } - if (*end == 'B') { end++; } + else if (*end == 'M') { overflow = mi_mul_overflow(size,MI_KiB,&size); end++; } + else if (*end == 'G') { overflow = mi_mul_overflow(size,MI_MiB,&size); end++; } + else if (*end == 'T') { overflow = mi_mul_overflow(size,MI_GiB,&size); end++; } + else { size = (size + MI_KiB - 1) / MI_KiB; } + if (end[0] == 'I' && end[1] == 'B') { end += 2; } // KiB, MiB, GiB, TiB + else if (*end == 'B') { end++; } // Kb, Mb, Gb, Tb + if (overflow || size > MI_MAX_ALLOC_SIZE) { size = (MI_MAX_ALLOC_SIZE / MI_KiB); } + value = (size > LONG_MAX ? LONG_MAX : (long)size); } if (*end == 0) { desc->value = value; desc->init = INITIALIZED; } else { - _mi_warning_message("environment option mimalloc_%s has an invalid value: %s\n", desc->name, buf); + // set `init` first to avoid recursion through _mi_warning_message on mimalloc_verbose. 
desc->init = DEFAULTED; + if (desc->option == mi_option_verbose && desc->value == 0) { + // if the 'mimalloc_verbose' env var has a bogus value we'd never know + // (since the value defaults to 'off') so in that case briefly enable verbose + desc->value = 1; + _mi_warning_message("environment option mimalloc_%s has an invalid value.\n", desc->name); + desc->value = 0; + } + else { + _mi_warning_message("environment option mimalloc_%s has an invalid value.\n", desc->name); + } } } mi_assert_internal(desc->init != UNINIT); diff --git a/contrib/libs/mimalloc/src/os.c b/contrib/libs/mimalloc/src/os.c index 85415232d7b5..88e7fcb32e12 100644 --- a/contrib/libs/mimalloc/src/os.c +++ b/contrib/libs/mimalloc/src/os.c @@ -1,552 +1,225 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2021, Microsoft Research, Daan Leijen +Copyright (c) 2018-2023, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ -#ifndef _DEFAULT_SOURCE -#define _DEFAULT_SOURCE // ensure mmap flags are defined -#endif - -#if defined(__sun) -// illumos provides new mman.h api when any of these are defined -// otherwise the old api based on caddr_t which predates the void pointers one. -// stock solaris provides only the former, chose to atomically to discard those -// flags only here rather than project wide tough. -#undef _XOPEN_SOURCE -#undef _POSIX_C_SOURCE -#endif #include "mimalloc.h" -#include "mimalloc-internal.h" -#include "mimalloc-atomic.h" - -#include // strerror - -#ifdef _MSC_VER -#pragma warning(disable:4996) // strerror -#endif - +#include "mimalloc/internal.h" +#include "mimalloc/atomic.h" +#include "mimalloc/prim.h" -#if defined(_WIN32) -#include -#elif defined(__wasi__) -// stdlib.h is all we need, and has already been included in mimalloc.h -#else -#include // mmap -#include // sysconf -#if defined(__linux__) -#include -#if defined(__GLIBC__) -#include // linux mmap flags -#else -#include -#endif -#endif -#if defined(__APPLE__) -#include -#if !TARGET_IOS_IPHONE && !TARGET_IOS_SIMULATOR -#include -#endif -#endif -#if defined(__HAIKU__) -#define madvise posix_madvise -#define MADV_DONTNEED POSIX_MADV_DONTNEED -#endif -#endif /* ----------------------------------------------------------- - Initialization. - On windows initializes support for aligned allocation and - large OS pages (if MIMALLOC_LARGE_OS_PAGES is true). + Initialization. ----------------------------------------------------------- */ -bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats); -static void* mi_align_up_ptr(void* p, size_t alignment) { - return (void*)_mi_align_up((uintptr_t)p, alignment); -} +static mi_os_mem_config_t mi_os_mem_config = { + 4096, // page size + 0, // large page size (usually 2MiB) + 4096, // allocation granularity + true, // has overcommit? (if true we use MAP_NORESERVE on mmap systems) + false, // can we partially free allocated blocks? (on mmap systems we can free anywhere in a mapped range, but on Windows we must free the entire span) + true // has virtual reserve? 
(if true we can reserve virtual address space without using commit or physical memory) +}; -static inline uintptr_t _mi_align_down(uintptr_t sz, size_t alignment) { - mi_assert_internal(alignment != 0); - uintptr_t mask = alignment - 1; - if ((alignment & mask) == 0) { // power of two? - return (sz & ~mask); - } - else { - return ((sz / alignment) * alignment); - } +bool _mi_os_has_overcommit(void) { + return mi_os_mem_config.has_overcommit; } -static void* mi_align_down_ptr(void* p, size_t alignment) { - return (void*)_mi_align_down((uintptr_t)p, alignment); +bool _mi_os_has_virtual_reserve(void) { + return mi_os_mem_config.has_virtual_reserve; } -// page size (initialized properly in `os_init`) -static size_t os_page_size = 4096; - -// minimal allocation granularity -static size_t os_alloc_granularity = 4096; - -// if non-zero, use large page allocation -static size_t large_os_page_size = 0; // OS (small) page size -size_t _mi_os_page_size() { - return os_page_size; +size_t _mi_os_page_size(void) { + return mi_os_mem_config.page_size; } // if large OS pages are supported (2 or 4MiB), then return the size, otherwise return the small page size (4KiB) -size_t _mi_os_large_page_size() { - return (large_os_page_size != 0 ? large_os_page_size : _mi_os_page_size()); +size_t _mi_os_large_page_size(void) { + return (mi_os_mem_config.large_page_size != 0 ? mi_os_mem_config.large_page_size : _mi_os_page_size()); } -static bool use_large_os_page(size_t size, size_t alignment) { +bool _mi_os_use_large_page(size_t size, size_t alignment) { // if we have access, check the size and alignment requirements - if (large_os_page_size == 0 || !mi_option_is_enabled(mi_option_large_os_pages)) return false; - return ((size % large_os_page_size) == 0 && (alignment % large_os_page_size) == 0); + if (mi_os_mem_config.large_page_size == 0 || !mi_option_is_enabled(mi_option_allow_large_os_pages)) return false; + return ((size % mi_os_mem_config.large_page_size) == 0 && (alignment % mi_os_mem_config.large_page_size) == 0); } // round to a good OS allocation size (bounded by max 12.5% waste) size_t _mi_os_good_alloc_size(size_t size) { size_t align_size; - if (size < 512*KiB) align_size = _mi_os_page_size(); - else if (size < 2*MiB) align_size = 64*KiB; - else if (size < 8*MiB) align_size = 256*KiB; - else if (size < 32*MiB) align_size = 1*MiB; - else align_size = 4*MiB; - if (mi_unlikely(size >= (SIZE_MAX - align_size))) return size; // possible overflow? + if (size < 512*MI_KiB) align_size = _mi_os_page_size(); + else if (size < 2*MI_MiB) align_size = 64*MI_KiB; + else if (size < 8*MI_MiB) align_size = 256*MI_KiB; + else if (size < 32*MI_MiB) align_size = 1*MI_MiB; + else align_size = 4*MI_MiB; + if mi_unlikely(size >= (SIZE_MAX - align_size)) return size; // possible overflow? return _mi_align_up(size, align_size); } -#if defined(_WIN32) -// We use VirtualAlloc2 for aligned allocation, but it is only supported on Windows 10 and Windows Server 2016. -// So, we need to look it up dynamically to run on older systems. (use __stdcall for 32-bit compatibility) -// NtAllocateVirtualAllocEx is used for huge OS page allocation (1GiB) -// -// We hide MEM_EXTENDED_PARAMETER to compile with older SDK's. 
-#include -typedef PVOID (__stdcall *PVirtualAlloc2)(HANDLE, PVOID, SIZE_T, ULONG, ULONG, /* MEM_EXTENDED_PARAMETER* */ void*, ULONG); -typedef NTSTATUS (__stdcall *PNtAllocateVirtualMemoryEx)(HANDLE, PVOID*, SIZE_T*, ULONG, ULONG, /* MEM_EXTENDED_PARAMETER* */ PVOID, ULONG); -static PVirtualAlloc2 pVirtualAlloc2 = NULL; -static PNtAllocateVirtualMemoryEx pNtAllocateVirtualMemoryEx = NULL; - -// Similarly, GetNumaProcesorNodeEx is only supported since Windows 7 -#if (_WIN32_WINNT < 0x601) // before Win7 -typedef struct _PROCESSOR_NUMBER { WORD Group; BYTE Number; BYTE Reserved; } PROCESSOR_NUMBER, *PPROCESSOR_NUMBER; -#endif -typedef VOID (__stdcall *PGetCurrentProcessorNumberEx)(PPROCESSOR_NUMBER ProcNumber); -typedef BOOL (__stdcall *PGetNumaProcessorNodeEx)(PPROCESSOR_NUMBER Processor, PUSHORT NodeNumber); -typedef BOOL (__stdcall* PGetNumaNodeProcessorMaskEx)(USHORT Node, PGROUP_AFFINITY ProcessorMask); -static PGetCurrentProcessorNumberEx pGetCurrentProcessorNumberEx = NULL; -static PGetNumaProcessorNodeEx pGetNumaProcessorNodeEx = NULL; -static PGetNumaNodeProcessorMaskEx pGetNumaNodeProcessorMaskEx = NULL; - -static bool mi_win_enable_large_os_pages() -{ - if (large_os_page_size > 0) return true; - - // Try to see if large OS pages are supported - // To use large pages on Windows, we first need access permission - // Set "Lock pages in memory" permission in the group policy editor - // - unsigned long err = 0; - HANDLE token = NULL; - BOOL ok = OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token); - if (ok) { - TOKEN_PRIVILEGES tp; - ok = LookupPrivilegeValue(NULL, TEXT("SeLockMemoryPrivilege"), &tp.Privileges[0].Luid); - if (ok) { - tp.PrivilegeCount = 1; - tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; - ok = AdjustTokenPrivileges(token, FALSE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, 0); - if (ok) { - err = GetLastError(); - ok = (err == ERROR_SUCCESS); - if (ok) { - large_os_page_size = GetLargePageMinimum(); - } - } - } - CloseHandle(token); - } - if (!ok) { - if (err == 0) err = GetLastError(); - _mi_warning_message("cannot enable large OS page support, error %lu\n", err); - } - return (ok!=0); -} - void _mi_os_init(void) { - // get the page size - SYSTEM_INFO si; - GetSystemInfo(&si); - if (si.dwPageSize > 0) os_page_size = si.dwPageSize; - if (si.dwAllocationGranularity > 0) os_alloc_granularity = si.dwAllocationGranularity; - // get the VirtualAlloc2 function - HINSTANCE hDll; - hDll = LoadLibrary(TEXT("kernelbase.dll")); - if (hDll != NULL) { - // use VirtualAlloc2FromApp if possible as it is available to Windows store apps - pVirtualAlloc2 = (PVirtualAlloc2)(void (*)(void))GetProcAddress(hDll, "VirtualAlloc2FromApp"); - if (pVirtualAlloc2==NULL) pVirtualAlloc2 = (PVirtualAlloc2)(void (*)(void))GetProcAddress(hDll, "VirtualAlloc2"); - FreeLibrary(hDll); - } - // NtAllocateVirtualMemoryEx is used for huge page allocation - hDll = LoadLibrary(TEXT("ntdll.dll")); - if (hDll != NULL) { - pNtAllocateVirtualMemoryEx = (PNtAllocateVirtualMemoryEx)(void (*)(void))GetProcAddress(hDll, "NtAllocateVirtualMemoryEx"); - FreeLibrary(hDll); - } - // Try to use Win7+ numa API - hDll = LoadLibrary(TEXT("kernel32.dll")); - if (hDll != NULL) { - pGetCurrentProcessorNumberEx = (PGetCurrentProcessorNumberEx)(void (*)(void))GetProcAddress(hDll, "GetCurrentProcessorNumberEx"); - pGetNumaProcessorNodeEx = (PGetNumaProcessorNodeEx)(void (*)(void))GetProcAddress(hDll, "GetNumaProcessorNodeEx"); - pGetNumaNodeProcessorMaskEx = (PGetNumaNodeProcessorMaskEx)(void 
(*)(void))GetProcAddress(hDll, "GetNumaNodeProcessorMaskEx"); - FreeLibrary(hDll); - } - if (mi_option_is_enabled(mi_option_large_os_pages) || mi_option_is_enabled(mi_option_reserve_huge_os_pages)) { - mi_win_enable_large_os_pages(); - } -} -#elif defined(__wasi__) -void _mi_os_init() { - os_page_size = 0x10000; // WebAssembly has a fixed page size: 64KB - os_alloc_granularity = 16; -} -#else -void _mi_os_init() { - // get the page size - long result = sysconf(_SC_PAGESIZE); - if (result > 0) { - os_page_size = (size_t)result; - os_alloc_granularity = os_page_size; - } - large_os_page_size = 2*MiB; // TODO: can we query the OS for this? + _mi_prim_mem_init(&mi_os_mem_config); } -#endif /* ----------------------------------------------------------- - Raw allocation on Windows (VirtualAlloc) and Unix's (mmap). ------------------------------------------------------------ */ + Util +-------------------------------------------------------------- */ +bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats); +bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats); -static bool mi_os_mem_free(void* addr, size_t size, bool was_committed, mi_stats_t* stats) -{ - if (addr == NULL || size == 0) return true; // || _mi_os_is_huge_reserved(addr) - bool err = false; -#if defined(_WIN32) - err = (VirtualFree(addr, 0, MEM_RELEASE) == 0); -#elif defined(__wasi__) - err = 0; // WebAssembly's heap cannot be shrunk -#else - err = (munmap(addr, size) == -1); -#endif - if (was_committed) _mi_stat_decrease(&stats->committed, size); - _mi_stat_decrease(&stats->reserved, size); - if (err) { - _mi_warning_message("munmap failed: %s, addr 0x%8li, size %lu\n", strerror(errno), (size_t)addr, size); - return false; +static inline uintptr_t _mi_align_down(uintptr_t sz, size_t alignment) { + mi_assert_internal(alignment != 0); + uintptr_t mask = alignment - 1; + if ((alignment & mask) == 0) { // power of two? 
+ return (sz & ~mask); } else { - return true; - } -} - -static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size); - -#ifdef _WIN32 -static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment, DWORD flags) { -#if (MI_INTPTR_SIZE >= 8) - // on 64-bit systems, try to use the virtual address area after 4TiB for 4MiB aligned allocations - void* hint; - if (addr == NULL && (hint = mi_os_get_aligned_hint(try_alignment,size)) != NULL) { - void* p = VirtualAlloc(hint, size, flags, PAGE_READWRITE); - if (p != NULL) return p; - DWORD err = GetLastError(); - if (err != ERROR_INVALID_ADDRESS && // If linked with multiple instances, we may have tried to allocate at an already allocated area (#210) - err != ERROR_INVALID_PARAMETER) { // Windows7 instability (#230) - return NULL; - } - // fall through - } -#endif -#if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS) - // on modern Windows try use VirtualAlloc2 for aligned allocation - if (try_alignment > 0 && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) { - MEM_ADDRESS_REQUIREMENTS reqs = { 0, 0, 0 }; - reqs.Alignment = try_alignment; - MEM_EXTENDED_PARAMETER param = { {0, 0}, {0} }; - param.Type = MemExtendedParameterAddressRequirements; - param.Pointer = &reqs; - return (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, ¶m, 1); + return ((sz / alignment) * alignment); } -#endif - // last resort - return VirtualAlloc(addr, size, flags, PAGE_READWRITE); } -static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment, DWORD flags, bool large_only, bool allow_large, bool* is_large) { - mi_assert_internal(!(large_only && !allow_large)); - static _Atomic(uintptr_t) large_page_try_ok; // = 0; - void* p = NULL; - if ((large_only || use_large_os_page(size, try_alignment)) - && allow_large && (flags&MEM_COMMIT)!=0 && (flags&MEM_RESERVE)!=0) { - uintptr_t try_ok = mi_atomic_load_acquire(&large_page_try_ok); - if (!large_only && try_ok > 0) { - // if a large page allocation fails, it seems the calls to VirtualAlloc get very expensive. - // therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times. - mi_atomic_cas_strong_acq_rel(&large_page_try_ok, &try_ok, try_ok - 1); - } - else { - // large OS pages must always reserve and commit. - *is_large = true; - p = mi_win_virtual_allocx(addr, size, try_alignment, flags | MEM_LARGE_PAGES); - if (large_only) return p; - // fall back to non-large page allocation on error (`p == NULL`). 
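
For reference, the small `_mi_align_down` / `mi_align_down_ptr` helpers added earlier in this hunk rely on the usual mask trick when the alignment is a power of two and fall back to integer division otherwise. A minimal standalone sketch of the same rounding, using illustrative names (`align_down`, `align_up`) that are not part of the patch:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    // Round `sz` down to a multiple of `alignment`.
    // A power-of-two alignment needs only a mask; others need a division.
    static uintptr_t align_down(uintptr_t sz, size_t alignment) {
      assert(alignment != 0);
      uintptr_t mask = alignment - 1;
      if ((alignment & mask) == 0) {          // power of two?
        return sz & ~mask;
      }
      return (sz / alignment) * alignment;
    }

    // Rounding up is rounding down after adding `alignment - 1`.
    static uintptr_t align_up(uintptr_t sz, size_t alignment) {
      return align_down(sz + alignment - 1, alignment);
    }

    int main(void) {
      printf("%lu\n", (unsigned long)align_down(4100, 4096)); // 4096
      printf("%lu\n", (unsigned long)align_up(4100, 4096));   // 8192
      return 0;
    }
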
- if (p == NULL) { - mi_atomic_store_release(&large_page_try_ok,10UL); // on error, don't try again for the next N allocations - } - } - } - if (p == NULL) { - *is_large = ((flags&MEM_LARGE_PAGES) != 0); - p = mi_win_virtual_allocx(addr, size, try_alignment, flags); - } - if (p == NULL) { - _mi_warning_message("unable to allocate OS memory (%zu bytes, error code: %i, address: %p, large only: %d, allow large: %d)\n", size, GetLastError(), addr, large_only, allow_large); - } - return p; +static void* mi_align_down_ptr(void* p, size_t alignment) { + return (void*)_mi_align_down((uintptr_t)p, alignment); } -#elif defined(__wasi__) -static void* mi_wasm_heap_grow(size_t size, size_t try_alignment) { - uintptr_t base = __builtin_wasm_memory_size(0) * _mi_os_page_size(); - uintptr_t aligned_base = _mi_align_up(base, (uintptr_t) try_alignment); - size_t alloc_size = _mi_align_up( aligned_base - base + size, _mi_os_page_size()); - mi_assert(alloc_size >= size && (alloc_size % _mi_os_page_size()) == 0); - if (alloc_size < size) return NULL; - if (__builtin_wasm_memory_grow(0, alloc_size / _mi_os_page_size()) == SIZE_MAX) { - errno = ENOMEM; - return NULL; - } - return (void*)aligned_base; -} -#else -#define MI_OS_USE_MMAP -static void* mi_unix_mmapx(void* addr, size_t size, size_t try_alignment, int protect_flags, int flags, int fd) { - void* p = NULL; - #if (MI_INTPTR_SIZE >= 8) && !defined(MAP_ALIGNED) - // on 64-bit systems, use the virtual address area after 4TiB for 4MiB aligned allocations - void* hint; - if (addr == NULL && (hint = mi_os_get_aligned_hint(try_alignment, size)) != NULL) { - p = mmap(hint,size,protect_flags,flags,fd,0); - if (p==MAP_FAILED) p = NULL; // fall back to regular mmap - } - #else - UNUSED(try_alignment); - UNUSED(mi_os_get_aligned_hint); - #endif - if (p==NULL) { - p = mmap(addr,size,protect_flags,flags,fd,0); - if (p==MAP_FAILED) p = NULL; - } - return p; -} -static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int protect_flags, bool large_only, bool allow_large, bool* is_large) { - void* p = NULL; - #if !defined(MAP_ANONYMOUS) - #define MAP_ANONYMOUS MAP_ANON - #endif - #if !defined(MAP_NORESERVE) - #define MAP_NORESERVE 0 - #endif - int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE; - int fd = -1; - #if defined(MAP_ALIGNED) // BSD - if (try_alignment > 0) { - size_t n = mi_bsr(try_alignment); - if (((size_t)1 << n) == try_alignment && n >= 12 && n <= 30) { // alignment is a power of 2 and 4096 <= alignment <= 1GiB - flags |= MAP_ALIGNED(n); - } - } - #endif - #if defined(PROT_MAX) - protect_flags |= PROT_MAX(PROT_READ | PROT_WRITE); // BSD - #endif - #if defined(VM_MAKE_TAG) - // macOS: tracking anonymous page with a specific ID. (All up to 98 are taken officially but LLVM sanitizers had taken 99) - int os_tag = (int)mi_option_get(mi_option_os_tag); - if (os_tag < 100 || os_tag > 255) os_tag = 100; - fd = VM_MAKE_TAG(os_tag); - #endif - if ((large_only || use_large_os_page(size, try_alignment)) && allow_large) { - static _Atomic(uintptr_t) large_page_try_ok; // = 0; - uintptr_t try_ok = mi_atomic_load_acquire(&large_page_try_ok); - if (!large_only && try_ok > 0) { - // If the OS is not configured for large OS pages, or the user does not have - // enough permission, the `mmap` will always fail (but it might also fail for other reasons). - // Therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times - // to avoid too many failing calls to mmap. 
- mi_atomic_cas_strong_acq_rel(&large_page_try_ok, &try_ok, try_ok - 1); - } - else { - int lflags = flags & ~MAP_NORESERVE; // using NORESERVE on huge pages seems to fail on Linux - int lfd = fd; - #ifdef MAP_ALIGNED_SUPER - lflags |= MAP_ALIGNED_SUPER; - #endif - #ifdef MAP_HUGETLB - lflags |= MAP_HUGETLB; - #endif - #ifdef MAP_HUGE_1GB - static bool mi_huge_pages_available = true; - if ((size % GiB) == 0 && mi_huge_pages_available) { - lflags |= MAP_HUGE_1GB; - } - else - #endif - { - #ifdef MAP_HUGE_2MB - lflags |= MAP_HUGE_2MB; - #endif - } - #ifdef VM_FLAGS_SUPERPAGE_SIZE_2MB - lfd |= VM_FLAGS_SUPERPAGE_SIZE_2MB; - #endif - if (large_only || lflags != flags) { - // try large OS page allocation - *is_large = true; - p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, lflags, lfd); - #ifdef MAP_HUGE_1GB - if (p == NULL && (lflags & MAP_HUGE_1GB) != 0) { - mi_huge_pages_available = false; // don't try huge 1GiB pages again - _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (error %i)\n", errno); - lflags = ((lflags & ~MAP_HUGE_1GB) | MAP_HUGE_2MB); - p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, lflags, lfd); - } - #endif - if (large_only) return p; - if (p == NULL) { - mi_atomic_store_release(&large_page_try_ok, (uintptr_t)10); // on error, don't try again for the next N allocations - } - } - } - } - if (p == NULL) { - *is_large = false; - p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, flags, fd); - #if defined(MADV_HUGEPAGE) - // Many Linux systems don't allow MAP_HUGETLB but they support instead - // transparent huge pages (THP). It is not required to call `madvise` with MADV_HUGE - // though since properly aligned allocations will already use large pages if available - // in that case -- in particular for our large regions (in `memory.c`). - // However, some systems only allow THP if called with explicit `madvise`, so - // when large OS pages are enabled for mimalloc, we call `madvise` anyways. - if (allow_large && use_large_os_page(size, try_alignment)) { - if (madvise(p, size, MADV_HUGEPAGE) == 0) { - *is_large = true; // possibly - }; - } - #endif - #if defined(__sun) - if (allow_large && use_large_os_page(size, try_alignment)) { - struct memcntl_mha cmd = {0}; - cmd.mha_pagesize = large_os_page_size; - cmd.mha_cmd = MHA_MAPSIZE_VA; - if (memcntl(p, size, MC_HAT_ADVISE, (caddr_t)&cmd, 0, 0) == 0) { - *is_large = true; - } - } - #endif - } - if (p == NULL) { - _mi_warning_message("unable to allocate OS memory (%zu bytes, error code: %i, address: %p, large only: %d, allow large: %d)\n", size, errno, addr, large_only, allow_large); - } - return p; -} -#endif +/* ----------------------------------------------------------- + aligned hinting +-------------------------------------------------------------- */ // On 64-bit systems, we can do efficient aligned allocation by using -// the 4TiB to 30TiB area to allocate them. -#if (MI_INTPTR_SIZE >= 8) && (defined(_WIN32) || (defined(MI_OS_USE_MMAP) && !defined(MAP_ALIGNED))) -static mi_decl_cache_align _Atomic(uintptr_t) aligned_base; +// the 2TiB to 30TiB area to allocate those. +#if (MI_INTPTR_SIZE >= 8) +static mi_decl_cache_align _Atomic(uintptr_t)aligned_base; -// Return a 4MiB aligned address that is probably available. -// If this returns NULL, the OS will determine the address but on some OS's that may not be +// Return a MI_SEGMENT_SIZE aligned address that is probably available. 
+// If this returns NULL, the OS will determine the address but on some OS's that may not be // properly aligned which can be more costly as it needs to be adjusted afterwards. -// For a size > 1GiB this always returns NULL in order to guarantee good ASLR randomization; -// (otherwise an initial large allocation of say 2TiB has a 50% chance to include (known) addresses +// For a size > 1GiB this always returns NULL in order to guarantee good ASLR randomization; +// (otherwise an initial large allocation of say 2TiB has a 50% chance to include (known) addresses // in the middle of the 2TiB - 6TiB address range (see issue #372)) -#define KK_HINT_BASE ((uintptr_t)2 << 40) // 2TiB start -#define KK_HINT_AREA ((uintptr_t)4 << 40) // upto 6TiB (since before win8 there is "only" 8TiB available to processes) -#define KK_HINT_MAX ((uintptr_t)30 << 40) // wrap after 30TiB (area after 32TiB is used for huge OS pages) +#define MI_HINT_BASE ((uintptr_t)2 << 40) // 2TiB start +#define MI_HINT_AREA ((uintptr_t)4 << 40) // upto 6TiB (since before win8 there is "only" 8TiB available to processes) +#define MI_HINT_MAX ((uintptr_t)30 << 40) // wrap after 30TiB (area after 32TiB is used for huge OS pages) -static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size) +void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size) { - if (try_alignment == 0 || try_alignment > MI_SEGMENT_SIZE) return NULL; - if ((size%MI_SEGMENT_SIZE) != 0) return NULL; - if (size > 1*GiB) return NULL; // guarantee the chance of fixed valid address is at most 1/(KK_HINT_AREA / 1<<30) = 1/4096. + if (try_alignment <= 1 || try_alignment > MI_SEGMENT_SIZE) return NULL; + size = _mi_align_up(size, MI_SEGMENT_SIZE); + if (size > 1*MI_GiB) return NULL; // guarantee the chance of fixed valid address is at most 1/(MI_HINT_AREA / 1<<30) = 1/4096. #if (MI_SECURE>0) size += MI_SEGMENT_SIZE; // put in `MI_SEGMENT_SIZE` virtual gaps between hinted blocks; this splits VLA's but increases guarded areas. #endif uintptr_t hint = mi_atomic_add_acq_rel(&aligned_base, size); - if (hint == 0 || hint > KK_HINT_MAX) { // wrap or initialize - uintptr_t init = KK_HINT_BASE; + if (hint == 0 || hint > MI_HINT_MAX) { // wrap or initialize + uintptr_t init = MI_HINT_BASE; #if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of aligned allocations unless in debug mode - uintptr_t r = _mi_heap_random_next(mi_get_default_heap()); - init = init + ((MI_SEGMENT_SIZE * ((r>>17) & 0xFFFFF)) % KK_HINT_AREA); // (randomly 20 bits)*4MiB == 0 to 4TiB + uintptr_t r = _mi_heap_random_next(mi_prim_get_default_heap()); + init = init + ((MI_SEGMENT_SIZE * ((r>>17) & 0xFFFFF)) % MI_HINT_AREA); // (randomly 20 bits)*4MiB == 0 to 4TiB #endif uintptr_t expected = hint + size; mi_atomic_cas_strong_acq_rel(&aligned_base, &expected, init); - hint = mi_atomic_add_acq_rel(&aligned_base, size); // this may still give 0 or > KK_HINT_MAX but that is ok, it is a hint after all + hint = mi_atomic_add_acq_rel(&aligned_base, size); // this may still give 0 or > MI_HINT_MAX but that is ok, it is a hint after all } if (hint%try_alignment != 0) return NULL; return (void*)hint; } #else -static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size) { - UNUSED(try_alignment); UNUSED(size); +void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size) { + MI_UNUSED(try_alignment); MI_UNUSED(size); return NULL; } #endif -// Primitive allocation from the OS. 
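
The hint machinery above is essentially a lock-free bump allocator over a fixed slice of the virtual address space: each request atomically advances `aligned_base` by the segment-aligned size, re-initializes it (with some randomization) once it passes `MI_HINT_MAX`, and returns NULL whenever the resulting address would not satisfy the requested alignment, so the caller falls back to letting the OS pick an address. A minimal sketch of that idea using C11 atomics, made-up constants, and no randomization (`HINT_BASE`, `SEGMENT_SIZE`, and `next_hint` are illustrative names, not part of the patch):

    #include <stdatomic.h>
    #include <stddef.h>
    #include <stdint.h>

    #define HINT_BASE    ((uintptr_t)2 << 40)   // 2 TiB
    #define HINT_MAX     ((uintptr_t)30 << 40)  // wrap after 30 TiB
    #define SEGMENT_SIZE ((uintptr_t)1 << 22)   // pretend 4 MiB segments

    static _Atomic uintptr_t hint_base;         // zero-initialized

    // Return a probably-free, SEGMENT_SIZE-aligned address hint, or 0.
    static uintptr_t next_hint(size_t size) {
      size = (size + SEGMENT_SIZE - 1) & ~(SEGMENT_SIZE - 1);   // segment-align
      uintptr_t hint = atomic_fetch_add(&hint_base, size);      // claim a range
      if (hint == 0 || hint > HINT_MAX) {                       // first use, or wrapped
        uintptr_t expected = hint + size;
        atomic_compare_exchange_strong(&hint_base, &expected, HINT_BASE);
        hint = atomic_fetch_add(&hint_base, size);
      }
      return (hint % SEGMENT_SIZE == 0) ? hint : 0;             // only hand out aligned hints
    }

The hint is then passed as the desired address to the underlying mmap/VirtualAlloc-style primitive; if the OS cannot honour it, allocation proceeds without a hint.
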
+/* ----------------------------------------------------------- + Free memory +-------------------------------------------------------------- */ + +static void mi_os_free_huge_os_pages(void* p, size_t size, mi_stats_t* stats); + +static void mi_os_prim_free(void* addr, size_t size, bool still_committed, mi_stats_t* tld_stats) { + MI_UNUSED(tld_stats); + mi_stats_t* stats = &_mi_stats_main; + mi_assert_internal((size % _mi_os_page_size()) == 0); + if (addr == NULL || size == 0) return; // || _mi_os_is_huge_reserved(addr) + int err = _mi_prim_free(addr, size); + if (err != 0) { + _mi_warning_message("unable to free OS memory (error: %d (0x%x), size: 0x%zx bytes, address: %p)\n", err, err, size, addr); + } + if (still_committed) { _mi_stat_decrease(&stats->committed, size); } + _mi_stat_decrease(&stats->reserved, size); +} + +void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t memid, mi_stats_t* tld_stats) { + if (mi_memkind_is_os(memid.memkind)) { + size_t csize = _mi_os_good_alloc_size(size); + void* base = addr; + // different base? (due to alignment) + if (memid.mem.os.base != NULL) { + mi_assert(memid.mem.os.base <= addr); + mi_assert((uint8_t*)memid.mem.os.base + memid.mem.os.alignment >= (uint8_t*)addr); + base = memid.mem.os.base; + csize += ((uint8_t*)addr - (uint8_t*)memid.mem.os.base); + } + // free it + if (memid.memkind == MI_MEM_OS_HUGE) { + mi_assert(memid.is_pinned); + mi_os_free_huge_os_pages(base, csize, tld_stats); + } + else { + mi_os_prim_free(base, csize, still_committed, tld_stats); + } + } + else { + // nothing to do + mi_assert(memid.memkind < MI_MEM_OS); + } +} + +void _mi_os_free(void* p, size_t size, mi_memid_t memid, mi_stats_t* tld_stats) { + _mi_os_free_ex(p, size, true, memid, tld_stats); +} + + +/* ----------------------------------------------------------- + Primitive allocation from the OS. +-------------------------------------------------------------- */ + // Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned. -static void* mi_os_mem_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, mi_stats_t* stats) { +static void* mi_os_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, mi_stats_t* tld_stats) { mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0); + mi_assert_internal(is_zero != NULL); + mi_assert_internal(is_large != NULL); if (size == 0) return NULL; - if (!commit) allow_large = false; - + if (!commit) { allow_large = false; } + if (try_alignment == 0) { try_alignment = 1; } // avoid 0 to ensure there will be no divide by zero when aligning + *is_zero = false; void* p = NULL; - /* - if (commit && allow_large) { - p = _mi_os_try_alloc_from_huge_reserved(size, try_alignment); - if (p != NULL) { - *is_large = true; - return p; - } + int err = _mi_prim_alloc(size, try_alignment, commit, allow_large, is_large, is_zero, &p); + if (err != 0) { + _mi_warning_message("unable to allocate OS memory (error: %d (0x%x), size: 0x%zx bytes, align: 0x%zx, commit: %d, allow large: %d)\n", err, err, size, try_alignment, commit, allow_large); } - */ - #if defined(_WIN32) - int flags = MEM_RESERVE; - if (commit) flags |= MEM_COMMIT; - p = mi_win_virtual_alloc(NULL, size, try_alignment, flags, false, allow_large, is_large); - #elif defined(__wasi__) - *is_large = false; - p = mi_wasm_heap_grow(size, try_alignment); - #else - int protect_flags = (commit ? 
(PROT_WRITE | PROT_READ) : PROT_NONE); - p = mi_unix_mmap(NULL, size, try_alignment, protect_flags, false, allow_large, is_large); - #endif + MI_UNUSED(tld_stats); + mi_stats_t* stats = &_mi_stats_main; mi_stat_counter_increase(stats->mmap_calls, 1); if (p != NULL) { _mi_stat_increase(&stats->reserved, size); - if (commit) { _mi_stat_increase(&stats->committed, size); } + if (commit) { + _mi_stat_increase(&stats->committed, size); + // seems needed for asan (or `mimalloc-test-api` fails) + #ifdef MI_TRACK_ASAN + if (*is_zero) { mi_track_mem_defined(p,size); } + else { mi_track_mem_undefined(p,size); } + #endif + } } return p; } @@ -554,119 +227,147 @@ static void* mi_os_mem_alloc(size_t size, size_t try_alignment, bool commit, boo // Primitive aligned allocation from the OS. // This function guarantees the allocated memory is aligned. -static void* mi_os_mem_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, bool* is_large, mi_stats_t* stats) { +static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** base, mi_stats_t* stats) { mi_assert_internal(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0)); mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0); + mi_assert_internal(is_large != NULL); + mi_assert_internal(is_zero != NULL); + mi_assert_internal(base != NULL); if (!commit) allow_large = false; if (!(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0))) return NULL; size = _mi_align_up(size, _mi_os_page_size()); // try first with a hint (this will be aligned directly on Win 10+ or BSD) - void* p = mi_os_mem_alloc(size, alignment, commit, allow_large, is_large, stats); + void* p = mi_os_prim_alloc(size, alignment, commit, allow_large, is_large, is_zero, stats); if (p == NULL) return NULL; - // if not aligned, free it, overallocate, and unmap around it - if (((uintptr_t)p % alignment != 0)) { - mi_os_mem_free(p, size, commit, stats); + // aligned already? + if (((uintptr_t)p % alignment) == 0) { + *base = p; + } + else { + // if not aligned, free it, overallocate, and unmap around it + _mi_warning_message("unable to allocate aligned OS memory directly, fall back to over-allocation (size: 0x%zx bytes, address: %p, alignment: 0x%zx, commit: %d)\n", size, p, alignment, commit); + mi_os_prim_free(p, size, commit, stats); if (size >= (SIZE_MAX - alignment)) return NULL; // overflow - size_t over_size = size + alignment; - -#if _WIN32 - // over-allocate and than re-allocate exactly at an aligned address in there. - // this may fail due to threads allocating at the same time so we - // retry this at most 3 times before giving up. 
- // (we can not decommit around the overallocation on Windows, because we can only - // free the original pointer, not one pointing inside the area) - int flags = MEM_RESERVE; - if (commit) flags |= MEM_COMMIT; - for (int tries = 0; tries < 3; tries++) { - // over-allocate to determine a virtual memory range - p = mi_os_mem_alloc(over_size, alignment, commit, false, is_large, stats); - if (p == NULL) return NULL; // error - if (((uintptr_t)p % alignment) == 0) { - // if p happens to be aligned, just decommit the left-over area - _mi_os_decommit((uint8_t*)p + size, over_size - size, stats); - break; - } - else { - // otherwise free and allocate at an aligned address in there - mi_os_mem_free(p, over_size, commit, stats); - void* aligned_p = mi_align_up_ptr(p, alignment); - p = mi_win_virtual_alloc(aligned_p, size, alignment, flags, false, allow_large, is_large); - if (p == aligned_p) break; // success! - if (p != NULL) { // should not happen? - mi_os_mem_free(p, size, commit, stats); - p = NULL; - } + const size_t over_size = size + alignment; + + if (!mi_os_mem_config.has_partial_free) { // win32 virtualAlloc cannot free parts of an allocated block + // over-allocate uncommitted (virtual) memory + p = mi_os_prim_alloc(over_size, 1 /*alignment*/, false /* commit? */, false /* allow_large */, is_large, is_zero, stats); + if (p == NULL) return NULL; + + // set p to the aligned part in the full region + // note: this is dangerous on Windows as VirtualFree needs the actual base pointer + // this is handled though by having the `base` field in the memid's + *base = p; // remember the base + p = mi_align_up_ptr(p, alignment); + + // explicitly commit only the aligned part + if (commit) { + _mi_os_commit(p, size, NULL, stats); } } -#else - // overallocate... - p = mi_os_mem_alloc(over_size, alignment, commit, false, is_large, stats); - if (p == NULL) return NULL; - // and selectively unmap parts around the over-allocated area. - void* aligned_p = mi_align_up_ptr(p, alignment); - size_t pre_size = (uint8_t*)aligned_p - (uint8_t*)p; - size_t mid_size = _mi_align_up(size, _mi_os_page_size()); - size_t post_size = over_size - pre_size - mid_size; - mi_assert_internal(pre_size < over_size && post_size < over_size && mid_size >= size); - if (pre_size > 0) mi_os_mem_free(p, pre_size, commit, stats); - if (post_size > 0) mi_os_mem_free((uint8_t*)aligned_p + mid_size, post_size, commit, stats); - // we can return the aligned pointer on `mmap` systems - p = aligned_p; -#endif + else { // mmap can free inside an allocation + // overallocate... + p = mi_os_prim_alloc(over_size, 1, commit, false, is_large, is_zero, stats); + if (p == NULL) return NULL; + + // and selectively unmap parts around the over-allocated area. + void* aligned_p = mi_align_up_ptr(p, alignment); + size_t pre_size = (uint8_t*)aligned_p - (uint8_t*)p; + size_t mid_size = _mi_align_up(size, _mi_os_page_size()); + size_t post_size = over_size - pre_size - mid_size; + mi_assert_internal(pre_size < over_size&& post_size < over_size&& mid_size >= size); + if (pre_size > 0) { mi_os_prim_free(p, pre_size, commit, stats); } + if (post_size > 0) { mi_os_prim_free((uint8_t*)aligned_p + mid_size, post_size, commit, stats); } + // we can return the aligned pointer on `mmap` systems + p = aligned_p; + *base = aligned_p; // since we freed the pre part, `*base == p`. 
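
To make the over-allocation fallback above concrete: when the first attempt does not come back suitably aligned, the code reserves `size + alignment` bytes, locates the aligned pointer inside that larger block, and, on systems where partial frees are possible, returns the unused head and tail to the OS. A small sketch of just that bookkeeping, assuming a power-of-two `alignment` and a page-aligned `size` (`os_free` and `trim_to_alignment` are illustrative stand-ins, not part of the patch):

    #include <stddef.h>
    #include <stdint.h>

    // Stand-in for the primitive free (the patch uses mi_os_prim_free here).
    static void os_free(void* addr, size_t size) { (void)addr; (void)size; }

    // Keep a `size`-byte, `alignment`-aligned middle of the over-allocated
    // block [p, p + over_size) and release the unused head and tail.
    static void* trim_to_alignment(void* p, size_t over_size,
                                   size_t size, size_t alignment) {
      uintptr_t aligned = ((uintptr_t)p + alignment - 1) & ~(uintptr_t)(alignment - 1);
      size_t pre_size  = (size_t)(aligned - (uintptr_t)p);   // unused head
      size_t post_size = over_size - pre_size - size;        // unused tail
      if (pre_size  > 0) { os_free(p, pre_size); }
      if (post_size > 0) { os_free((uint8_t*)aligned + size, post_size); }
      return (void*)aligned;
    }

On the Windows-style path, where a reservation can only be freed as a whole, the patch instead remembers the original base pointer in the memid and commits only the aligned middle part.
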
+ } } - mi_assert_internal(p == NULL || (p != NULL && ((uintptr_t)p % alignment) == 0)); + mi_assert_internal(p == NULL || (p != NULL && *base != NULL && ((uintptr_t)p % alignment) == 0)); return p; } + /* ----------------------------------------------------------- - OS API: alloc, free, alloc_aligned + OS API: alloc and alloc_aligned ----------------------------------------------------------- */ -void* _mi_os_alloc(size_t size, mi_stats_t* tld_stats) { - UNUSED(tld_stats); - mi_stats_t* stats = &_mi_stats_main; +void* _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* stats) { + *memid = _mi_memid_none(); if (size == 0) return NULL; size = _mi_os_good_alloc_size(size); - bool is_large = false; - return mi_os_mem_alloc(size, 0, true, false, &is_large, stats); -} - -void _mi_os_free_ex(void* p, size_t size, bool was_committed, mi_stats_t* tld_stats) { - UNUSED(tld_stats); - mi_stats_t* stats = &_mi_stats_main; - if (size == 0 || p == NULL) return; - size = _mi_os_good_alloc_size(size); - mi_os_mem_free(p, size, was_committed, stats); -} - -void _mi_os_free(void* p, size_t size, mi_stats_t* stats) { - _mi_os_free_ex(p, size, true, stats); + bool os_is_large = false; + bool os_is_zero = false; + void* p = mi_os_prim_alloc(size, 0, true, false, &os_is_large, &os_is_zero, stats); + if (p != NULL) { + *memid = _mi_memid_create_os(true, os_is_zero, os_is_large); + } + return p; } -void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_stats_t* tld_stats) +void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* stats) { - UNUSED(tld_stats); + MI_UNUSED(&_mi_os_get_aligned_hint); // suppress unused warnings + *memid = _mi_memid_none(); if (size == 0) return NULL; size = _mi_os_good_alloc_size(size); alignment = _mi_align_up(alignment, _mi_os_page_size()); - bool allow_large = false; - if (large != NULL) { - allow_large = *large; - *large = false; + + bool os_is_large = false; + bool os_is_zero = false; + void* os_base = NULL; + void* p = mi_os_prim_alloc_aligned(size, alignment, commit, allow_large, &os_is_large, &os_is_zero, &os_base, stats ); + if (p != NULL) { + *memid = _mi_memid_create_os(commit, os_is_zero, os_is_large); + memid->mem.os.base = os_base; + memid->mem.os.alignment = alignment; } - return mi_os_mem_alloc_aligned(size, alignment, commit, allow_large, (large!=NULL?large:&allow_large), &_mi_stats_main /*tld->stats*/ ); + return p; } +/* ----------------------------------------------------------- + OS aligned allocation with an offset. This is used + for large alignments > MI_BLOCK_ALIGNMENT_MAX. We use a large mimalloc + page where the object can be aligned at an offset from the start of the segment. + As we may need to overallocate, we need to free such pointers using `mi_free_aligned` + to use the actual start of the memory region. 
+----------------------------------------------------------- */ +void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t offset, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* stats) { + mi_assert(offset <= MI_SEGMENT_SIZE); + mi_assert(offset <= size); + mi_assert((alignment % _mi_os_page_size()) == 0); + *memid = _mi_memid_none(); + if (offset > MI_SEGMENT_SIZE) return NULL; + if (offset == 0) { + // regular aligned allocation + return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, stats); + } + else { + // overallocate to align at an offset + const size_t extra = _mi_align_up(offset, alignment) - offset; + const size_t oversize = size + extra; + void* const start = _mi_os_alloc_aligned(oversize, alignment, commit, allow_large, memid, stats); + if (start == NULL) return NULL; + + void* const p = (uint8_t*)start + extra; + mi_assert(_mi_is_aligned((uint8_t*)p + offset, alignment)); + // decommit the overallocation at the start + if (commit && extra > _mi_os_page_size()) { + _mi_os_decommit(start, extra, stats); + } + return p; + } +} /* ----------------------------------------------------------- OS memory API: reset, commit, decommit, protect, unprotect. ----------------------------------------------------------- */ - // OS page align within a given area, either conservative (pages inside the area only), // or not (straddling pages outside the area is possible) static void* mi_os_page_align_areax(bool conservative, void* addr, size_t size, size_t* newsize) { @@ -691,176 +392,115 @@ static void* mi_os_page_align_area_conservative(void* addr, size_t size, size_t* return mi_os_page_align_areax(true, addr, size, newsize); } -static void mi_mprotect_hint(int err) { -#if defined(MI_OS_USE_MMAP) && (MI_SECURE>=2) // guard page around every mimalloc page - if (err == ENOMEM) { - _mi_warning_message("the previous warning may have been caused by a low memory map limit.\n" - " On Linux this is controlled by the vm.max_map_count. For example:\n" - " > sudo sysctl -w vm.max_map_count=262144\n"); - } -#else - UNUSED(err); -#endif -} - -// Commit/Decommit memory. -// Usually commit is aligned liberal, while decommit is aligned conservative. -// (but not for the reset version where we want commit to be conservative as well) -static bool mi_os_commitx(void* addr, size_t size, bool commit, bool conservative, bool* is_zero, mi_stats_t* stats) { - // page align in the range, commit liberally, decommit conservative +bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats) { + MI_UNUSED(tld_stats); + mi_stats_t* stats = &_mi_stats_main; if (is_zero != NULL) { *is_zero = false; } + _mi_stat_increase(&stats->committed, size); // use size for precise commit vs. decommit + _mi_stat_counter_increase(&stats->commit_calls, 1); + + // page align range size_t csize; - void* start = mi_os_page_align_areax(conservative, addr, size, &csize); - if (csize == 0) return true; // || _mi_os_is_huge_reserved(addr)) - int err = 0; - if (commit) { - _mi_stat_increase(&stats->committed, size); // use size for precise commit vs. decommit - _mi_stat_counter_increase(&stats->commit_calls, 1); - } - else { - _mi_stat_decrease(&stats->committed, size); - } + void* start = mi_os_page_align_areax(false /* conservative? 
*/, addr, size, &csize); + if (csize == 0) return true; - #if defined(_WIN32) - if (commit) { - // if the memory was already committed, the call succeeds but it is not zero'd - // *is_zero = true; - void* p = VirtualAlloc(start, csize, MEM_COMMIT, PAGE_READWRITE); - err = (p == start ? 0 : GetLastError()); - } - else { - BOOL ok = VirtualFree(start, csize, MEM_DECOMMIT); - err = (ok ? 0 : GetLastError()); - } - #elif defined(__wasi__) - // WebAssembly guests can't control memory protection - #elif defined(MAP_FIXED) - if (!commit) { - // use mmap with MAP_FIXED to discard the existing memory (and reduce commit charge) - void* p = mmap(start, csize, PROT_NONE, (MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE), -1, 0); - if (p != start) { err = errno; } + // commit + bool os_is_zero = false; + int err = _mi_prim_commit(start, csize, &os_is_zero); + if (err != 0) { + _mi_warning_message("cannot commit OS memory (error: %d (0x%x), address: %p, size: 0x%zx bytes)\n", err, err, start, csize); + return false; } - else { - // for commit, just change the protection - err = mprotect(start, csize, (PROT_READ | PROT_WRITE)); - if (err != 0) { err = errno; } - #if defined(MADV_FREE_REUSE) - while ((err = madvise(start, csize, MADV_FREE_REUSE)) != 0 && errno == EAGAIN) { errno = 0; } - #endif + if (os_is_zero && is_zero != NULL) { + *is_zero = true; + mi_assert_expensive(mi_mem_is_zero(start, csize)); } - #else - err = mprotect(start, csize, (commit ? (PROT_READ | PROT_WRITE) : PROT_NONE)); - if (err != 0) { err = errno; } + // note: the following seems required for asan (otherwise `mimalloc-test-stress` fails) + #ifdef MI_TRACK_ASAN + if (os_is_zero) { mi_track_mem_defined(start,csize); } + else { mi_track_mem_undefined(start,csize); } #endif + return true; +} + +static bool mi_os_decommit_ex(void* addr, size_t size, bool* needs_recommit, mi_stats_t* tld_stats) { + MI_UNUSED(tld_stats); + mi_stats_t* stats = &_mi_stats_main; + mi_assert_internal(needs_recommit!=NULL); + _mi_stat_decrease(&stats->committed, size); + + // page align + size_t csize; + void* start = mi_os_page_align_area_conservative(addr, size, &csize); + if (csize == 0) return true; + + // decommit + *needs_recommit = true; + int err = _mi_prim_decommit(start,csize,needs_recommit); if (err != 0) { - _mi_warning_message("%s error: start: %p, csize: 0x%x, err: %i\n", commit ? "commit" : "decommit", start, csize, err); - mi_mprotect_hint(err); + _mi_warning_message("cannot decommit OS memory (error: %d (0x%x), address: %p, size: 0x%zx bytes)\n", err, err, start, csize); } mi_assert_internal(err == 0); return (err == 0); } -bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats) { - UNUSED(tld_stats); - mi_stats_t* stats = &_mi_stats_main; - return mi_os_commitx(addr, size, true, false /* liberal */, is_zero, stats); -} - bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* tld_stats) { - UNUSED(tld_stats); - mi_stats_t* stats = &_mi_stats_main; - bool is_zero; - return mi_os_commitx(addr, size, false, true /* conservative */, &is_zero, stats); + bool needs_recommit; + return mi_os_decommit_ex(addr, size, &needs_recommit, tld_stats); } -static bool mi_os_commit_unreset(void* addr, size_t size, bool* is_zero, mi_stats_t* stats) { - return mi_os_commitx(addr, size, true, true /* conservative */, is_zero, stats); -} // Signal to the OS that the address range is no longer in use // but may be used later again. This will release physical memory // pages and reduce swapping while keeping the memory committed. 
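
The commit and decommit paths above deliberately use the two different page-alignment modes of `mi_os_page_align_areax`: commit may round outward to whole pages (liberal), while decommit must stay strictly inside the requested range (conservative) so that memory adjacent to the range is never decommitted by accident. A small sketch of the two roundings, assuming a power-of-two page size (the constant and function names are illustrative, not part of the patch):

    #include <stddef.h>
    #include <stdint.h>

    #define PAGE_SIZE ((uintptr_t)4096)   // assumed; the patch queries the OS for this

    // Liberal: the smallest page-aligned range that covers [addr, addr + size).
    static void page_align_liberal(uintptr_t addr, size_t size,
                                   uintptr_t* start, size_t* csize) {
      *start = addr & ~(PAGE_SIZE - 1);
      uintptr_t end = (addr + size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
      *csize = (size_t)(end - *start);
    }

    // Conservative: the largest page-aligned range contained in [addr, addr + size).
    static void page_align_conservative(uintptr_t addr, size_t size,
                                        uintptr_t* start, size_t* csize) {
      *start = (addr + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
      uintptr_t end = (addr + size) & ~(PAGE_SIZE - 1);
      *csize = (end > *start) ? (size_t)(end - *start) : 0;
    }
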
// We page align to a conservative area inside the range to reset. -static bool mi_os_resetx(void* addr, size_t size, bool reset, mi_stats_t* stats) { +bool _mi_os_reset(void* addr, size_t size, mi_stats_t* stats) { // page align conservatively within the range size_t csize; void* start = mi_os_page_align_area_conservative(addr, size, &csize); if (csize == 0) return true; // || _mi_os_is_huge_reserved(addr) - if (reset) _mi_stat_increase(&stats->reset, csize); - else _mi_stat_decrease(&stats->reset, csize); - if (!reset) return true; // nothing to do on unreset! + _mi_stat_increase(&stats->reset, csize); + _mi_stat_counter_increase(&stats->reset_calls, 1); - #if (MI_DEBUG>1) - if (MI_SECURE==0) { - memset(start, 0, csize); // pretend it is eagerly reset - } + #if (MI_DEBUG>1) && !MI_SECURE && !MI_TRACK_ENABLED // && !MI_TSAN + memset(start, 0, csize); // pretend it is eagerly reset #endif -#if defined(_WIN32) - // Testing shows that for us (on `malloc-large`) MEM_RESET is 2x faster than DiscardVirtualMemory - void* p = VirtualAlloc(start, csize, MEM_RESET, PAGE_READWRITE); - mi_assert_internal(p == start); - #if 1 - if (p == start && start != NULL) { - VirtualUnlock(start,csize); // VirtualUnlock after MEM_RESET removes the memory from the working set - } - #endif - if (p != start) return false; -#else -#if defined(MADV_FREE) - #if defined(MADV_FREE_REUSABLE) - #define KK_MADV_FREE_INITIAL MADV_FREE_REUSABLE - #else - #define KK_MADV_FREE_INITIAL MADV_FREE - #endif - static _Atomic(uintptr_t) advice = ATOMIC_VAR_INIT(KK_MADV_FREE_INITIAL); - int oadvice = (int)mi_atomic_load_relaxed(&advice); - int err; - while ((err = madvise(start, csize, oadvice)) != 0 && errno == EAGAIN) { errno = 0; }; - if (err != 0 && errno == EINVAL && oadvice == KK_MADV_FREE_INITIAL) { - // if MADV_FREE/MADV_FREE_REUSABLE is not supported, fall back to MADV_DONTNEED from now on - mi_atomic_store_release(&advice, (uintptr_t)MADV_DONTNEED); - err = madvise(start, csize, MADV_DONTNEED); - } -#elif defined(__wasi__) - int err = 0; -#else - int err = madvise(start, csize, MADV_DONTNEED); -#endif + int err = _mi_prim_reset(start, csize); if (err != 0) { - _mi_warning_message("madvise reset error: start: %p, csize: 0x%x, errno: %i\n", start, csize, errno); + _mi_warning_message("cannot reset OS memory (error: %d (0x%x), address: %p, size: 0x%zx bytes)\n", err, err, start, csize); } - //mi_assert(err == 0); - if (err != 0) return false; -#endif - return true; + return (err == 0); } -// Signal to the OS that the address range is no longer in use -// but may be used later again. This will release physical memory -// pages and reduce swapping while keeping the memory committed. -// We page align to a conservative area inside the range to reset. -bool _mi_os_reset(void* addr, size_t size, mi_stats_t* tld_stats) { - UNUSED(tld_stats); - mi_stats_t* stats = &_mi_stats_main; - if (mi_option_is_enabled(mi_option_reset_decommits)) { - return _mi_os_decommit(addr, size, stats); + +// either resets or decommits memory, returns true if the memory needs +// to be recommitted if it is to be re-used later on. +bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, mi_stats_t* stats) +{ + if (mi_option_get(mi_option_purge_delay) < 0) return false; // is purging allowed? + _mi_stat_counter_increase(&stats->purge_calls, 1); + _mi_stat_increase(&stats->purged, size); + + if (mi_option_is_enabled(mi_option_purge_decommits) && // should decommit? 
+ !_mi_preloading()) // don't decommit during preloading (unsafe) + { + bool needs_recommit = true; + mi_os_decommit_ex(p, size, &needs_recommit, stats); + return needs_recommit; } else { - return mi_os_resetx(addr, size, true, stats); + if (allow_reset) { // this can sometimes be not allowed if the range is not fully committed + _mi_os_reset(p, size, stats); + } + return false; // needs no recommit } } -bool _mi_os_unreset(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats) { - UNUSED(tld_stats); - mi_stats_t* stats = &_mi_stats_main; - if (mi_option_is_enabled(mi_option_reset_decommits)) { - return mi_os_commit_unreset(addr, size, is_zero, stats); // re-commit it (conservatively!) - } - else { - *is_zero = false; - return mi_os_resetx(addr, size, false, stats); - } +// either resets or decommits memory, returns true if the memory needs +// to be recommitted if it is to be re-used later on. +bool _mi_os_purge(void* p, size_t size, mi_stats_t * stats) { + return _mi_os_purge_ex(p, size, true, stats); } @@ -875,20 +515,9 @@ static bool mi_os_protectx(void* addr, size_t size, bool protect) { _mi_warning_message("cannot mprotect memory allocated in huge OS pages\n"); } */ - int err = 0; -#ifdef _WIN32 - DWORD oldprotect = 0; - BOOL ok = VirtualProtect(start, csize, protect ? PAGE_NOACCESS : PAGE_READWRITE, &oldprotect); - err = (ok ? 0 : GetLastError()); -#elif defined(__wasi__) - err = 0; -#else - err = mprotect(start, csize, protect ? PROT_NONE : (PROT_READ | PROT_WRITE)); - if (err != 0) { err = errno; } -#endif + int err = _mi_prim_protect(start,csize,protect); if (err != 0) { - _mi_warning_message("mprotect error: start: %p, csize: 0x%x, err: %i\n", start, csize, err); - mi_mprotect_hint(err); + _mi_warning_message("cannot %s OS memory (error: %d (0x%x), address: %p, size: 0x%zx bytes)\n", (protect ? "protect" : "unprotect"), err, err, start, csize); } return (err == 0); } @@ -903,121 +532,12 @@ bool _mi_os_unprotect(void* addr, size_t size) { -bool _mi_os_shrink(void* p, size_t oldsize, size_t newsize, mi_stats_t* stats) { - // page align conservatively within the range - mi_assert_internal(oldsize > newsize && p != NULL); - if (oldsize < newsize || p == NULL) return false; - if (oldsize == newsize) return true; - - // oldsize and newsize should be page aligned or we cannot shrink precisely - void* addr = (uint8_t*)p + newsize; - size_t size = 0; - void* start = mi_os_page_align_area_conservative(addr, oldsize - newsize, &size); - if (size == 0 || start != addr) return false; - -#ifdef _WIN32 - // we cannot shrink on windows, but we can decommit - return _mi_os_decommit(start, size, stats); -#else - return mi_os_mem_free(start, size, true, stats); -#endif -} - - /* ---------------------------------------------------------------------------- Support for allocating huge OS pages (1Gib) that are reserved up-front and possibly associated with a specific NUMA node. 
(use `numa_node>=0`) -----------------------------------------------------------------------------*/ -#define MI_HUGE_OS_PAGE_SIZE (GiB) +#define MI_HUGE_OS_PAGE_SIZE (MI_GiB) -#if defined(_WIN32) && (MI_INTPTR_SIZE >= 8) -static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) -{ - mi_assert_internal(size%GiB == 0); - mi_assert_internal(addr != NULL); - const DWORD flags = MEM_LARGE_PAGES | MEM_COMMIT | MEM_RESERVE; - - mi_win_enable_large_os_pages(); - - #if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS) - MEM_EXTENDED_PARAMETER params[3] = { {{0,0},{0}},{{0,0},{0}},{{0,0},{0}} }; - // on modern Windows try use NtAllocateVirtualMemoryEx for 1GiB huge pages - static bool mi_huge_pages_available = true; - if (pNtAllocateVirtualMemoryEx != NULL && mi_huge_pages_available) { - #ifndef MEM_EXTENDED_PARAMETER_NONPAGED_HUGE - #define MEM_EXTENDED_PARAMETER_NONPAGED_HUGE (0x10) - #endif - params[0].Type = 5; // == MemExtendedParameterAttributeFlags; - params[0].ULong64 = MEM_EXTENDED_PARAMETER_NONPAGED_HUGE; - ULONG param_count = 1; - if (numa_node >= 0) { - param_count++; - params[1].Type = MemExtendedParameterNumaNode; - params[1].ULong = (unsigned)numa_node; - } - SIZE_T psize = size; - void* base = addr; - NTSTATUS err = (*pNtAllocateVirtualMemoryEx)(GetCurrentProcess(), &base, &psize, flags, PAGE_READWRITE, params, param_count); - if (err == 0 && base != NULL) { - return base; - } - else { - // fall back to regular large pages - mi_huge_pages_available = false; // don't try further huge pages - _mi_warning_message("unable to allocate using huge (1gb) pages, trying large (2mb) pages instead (status 0x%lx)\n", err); - } - } - // on modern Windows try use VirtualAlloc2 for numa aware large OS page allocation - if (pVirtualAlloc2 != NULL && numa_node >= 0) { - params[0].Type = MemExtendedParameterNumaNode; - params[0].ULong = (unsigned)numa_node; - return (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, params, 1); - } - #else - UNUSED(numa_node); - #endif - // otherwise use regular virtual alloc on older windows - return VirtualAlloc(addr, size, flags, PAGE_READWRITE); -} - -#elif defined(MI_OS_USE_MMAP) && (MI_INTPTR_SIZE >= 8) && !defined(__HAIKU__) -#include -#ifndef MPOL_PREFERRED -#define MPOL_PREFERRED 1 -#endif -#if defined(SYS_mbind) -static long mi_os_mbind(void* start, unsigned long len, unsigned long mode, const unsigned long* nmask, unsigned long maxnode, unsigned flags) { - return syscall(SYS_mbind, start, len, mode, nmask, maxnode, flags); -} -#else -static long mi_os_mbind(void* start, unsigned long len, unsigned long mode, const unsigned long* nmask, unsigned long maxnode, unsigned flags) { - UNUSED(start); UNUSED(len); UNUSED(mode); UNUSED(nmask); UNUSED(maxnode); UNUSED(flags); - return 0; -} -#endif -static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) { - mi_assert_internal(size%GiB == 0); - bool is_large = true; - void* p = mi_unix_mmap(addr, size, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &is_large); - if (p == NULL) return NULL; - if (numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) { // at most 64 nodes - uintptr_t numa_mask = (1UL << numa_node); - // TODO: does `mbind` work correctly for huge OS pages? should we - // use `set_mempolicy` before calling mmap instead? 
- // see: - long err = mi_os_mbind(p, size, MPOL_PREFERRED, &numa_mask, 8*MI_INTPTR_SIZE, 0); - if (err != 0) { - _mi_warning_message("failed to bind huge (1gb) pages to numa node %d: %s\n", numa_node, strerror(errno)); - } - } - return p; -} -#else -static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) { - UNUSED(addr); UNUSED(size); UNUSED(numa_node); - return NULL; -} -#endif #if (MI_INTPTR_SIZE >= 8) // To ensure proper alignment, use our own area for huge OS pages @@ -1036,10 +556,10 @@ static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) { if (start == 0) { // Initialize the start address after the 32TiB area start = ((uintptr_t)32 << 40); // 32TiB virtual start address -#if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of huge pages unless in debug mode - uintptr_t r = _mi_heap_random_next(mi_get_default_heap()); + #if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of huge pages unless in debug mode + uintptr_t r = _mi_heap_random_next(mi_prim_get_default_heap()); start = start + ((uintptr_t)MI_HUGE_OS_PAGE_SIZE * ((r>>17) & 0x0FFF)); // (randomly 12bits)*1GiB == between 0 to 4TiB -#endif + #endif } end = start + size; mi_assert_internal(end % MI_SEGMENT_SIZE == 0); @@ -1050,14 +570,15 @@ static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) { } #else static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) { - UNUSED(pages); + MI_UNUSED(pages); if (total_size != NULL) *total_size = 0; return NULL; } #endif // Allocate MI_SEGMENT_SIZE aligned huge pages -void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_msecs, size_t* pages_reserved, size_t* psize) { +void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_msecs, size_t* pages_reserved, size_t* psize, mi_memid_t* memid) { + *memid = _mi_memid_none(); if (psize != NULL) *psize = 0; if (pages_reserved != NULL) *pages_reserved = 0; size_t size = 0; @@ -1068,23 +589,32 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse // We allocate one page at the time to be able to abort if it takes too long // or to at least allocate as many as available on the system. mi_msecs_t start_t = _mi_clock_start(); - size_t page; - for (page = 0; page < pages; page++) { + size_t page = 0; + bool all_zero = true; + while (page < pages) { // allocate a page + bool is_zero = false; void* addr = start + (page * MI_HUGE_OS_PAGE_SIZE); - void* p = mi_os_alloc_huge_os_pagesx(addr, MI_HUGE_OS_PAGE_SIZE, numa_node); + void* p = NULL; + int err = _mi_prim_alloc_huge_os_pages(addr, MI_HUGE_OS_PAGE_SIZE, numa_node, &is_zero, &p); + if (!is_zero) { all_zero = false; } + if (err != 0) { + _mi_warning_message("unable to allocate huge OS page (error: %d (0x%x), address: %p, size: %zx bytes)\n", err, err, addr, MI_HUGE_OS_PAGE_SIZE); + break; + } // Did we succeed at a contiguous address? 
if (p != addr) { // no success, issue a warning and break if (p != NULL) { - _mi_warning_message("could not allocate contiguous huge page %zu at %p\n", page, addr); - _mi_os_free(p, MI_HUGE_OS_PAGE_SIZE, &_mi_stats_main); + _mi_warning_message("could not allocate contiguous huge OS page %zu at %p\n", page, addr); + mi_os_prim_free(p, MI_HUGE_OS_PAGE_SIZE, true, &_mi_stats_main); } break; } // success, record it + page++; // increase before timeout check (see issue #711) _mi_stat_increase(&_mi_stats_main.committed, MI_HUGE_OS_PAGE_SIZE); _mi_stat_increase(&_mi_stats_main.reserved, MI_HUGE_OS_PAGE_SIZE); @@ -1098,109 +628,41 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse } } if (elapsed > max_msecs) { - _mi_warning_message("huge page allocation timed out\n"); + _mi_warning_message("huge OS page allocation timed out (after allocating %zu page(s))\n", page); break; } } } mi_assert_internal(page*MI_HUGE_OS_PAGE_SIZE <= size); - if (pages_reserved != NULL) *pages_reserved = page; - if (psize != NULL) *psize = page * MI_HUGE_OS_PAGE_SIZE; + if (pages_reserved != NULL) { *pages_reserved = page; } + if (psize != NULL) { *psize = page * MI_HUGE_OS_PAGE_SIZE; } + if (page != 0) { + mi_assert(start != NULL); + *memid = _mi_memid_create_os(true /* is committed */, all_zero, true /* is_large */); + memid->memkind = MI_MEM_OS_HUGE; + mi_assert(memid->is_pinned); + #ifdef MI_TRACK_ASAN + if (all_zero) { mi_track_mem_defined(start,size); } + #endif + } return (page == 0 ? NULL : start); } // free every huge page in a range individually (as we allocated per page) // note: needed with VirtualAlloc but could potentially be done in one go on mmap'd systems. -void _mi_os_free_huge_pages(void* p, size_t size, mi_stats_t* stats) { +static void mi_os_free_huge_os_pages(void* p, size_t size, mi_stats_t* stats) { if (p==NULL || size==0) return; uint8_t* base = (uint8_t*)p; while (size >= MI_HUGE_OS_PAGE_SIZE) { - _mi_os_free(base, MI_HUGE_OS_PAGE_SIZE, stats); + mi_os_prim_free(base, MI_HUGE_OS_PAGE_SIZE, true, stats); size -= MI_HUGE_OS_PAGE_SIZE; + base += MI_HUGE_OS_PAGE_SIZE; } } /* ---------------------------------------------------------------------------- Support NUMA aware allocation -----------------------------------------------------------------------------*/ -#ifdef _WIN32 -static size_t mi_os_numa_nodex() { - USHORT numa_node = 0; - if (pGetCurrentProcessorNumberEx != NULL && pGetNumaProcessorNodeEx != NULL) { - // Extended API is supported - PROCESSOR_NUMBER pnum; - (*pGetCurrentProcessorNumberEx)(&pnum); - USHORT nnode = 0; - BOOL ok = (*pGetNumaProcessorNodeEx)(&pnum, &nnode); - if (ok) numa_node = nnode; - } - else { - // Vista or earlier, use older API that is limited to 64 processors. Issue #277 - DWORD pnum = GetCurrentProcessorNumber(); - UCHAR nnode = 0; - BOOL ok = GetNumaProcessorNode((UCHAR)pnum, &nnode); - if (ok) numa_node = nnode; - } - return numa_node; -} - -static size_t mi_os_numa_node_countx(void) { - ULONG numa_max = 0; - GetNumaHighestNodeNumber(&numa_max); - // find the highest node number that has actual processors assigned to it. Issue #282 - while(numa_max > 0) { - if (pGetNumaNodeProcessorMaskEx != NULL) { - // Extended API is supported - GROUP_AFFINITY affinity; - if ((*pGetNumaNodeProcessorMaskEx)((USHORT)numa_max, &affinity)) { - if (affinity.Mask != 0) break; // found the maximum non-empty node - } - } - else { - // Vista or earlier, use older API that is limited to 64 processors. 
- ULONGLONG mask; - if (GetNumaNodeProcessorMask((UCHAR)numa_max, &mask)) { - if (mask != 0) break; // found the maximum non-empty node - }; - } - // max node was invalid or had no processor assigned, try again - numa_max--; - } - return ((size_t)numa_max + 1); -} -#elif defined(__linux__) -#include // getcpu -#include // access - -static size_t mi_os_numa_nodex(void) { -#ifdef SYS_getcpu - unsigned long node = 0; - unsigned long ncpu = 0; - long err = syscall(SYS_getcpu, &ncpu, &node, NULL); - if (err != 0) return 0; - return node; -#else - return 0; -#endif -} -static size_t mi_os_numa_node_countx(void) { - char buf[128]; - unsigned node = 0; - for(node = 0; node < 256; node++) { - // enumerate node entries -- todo: it there a more efficient way to do this? (but ensure there is no allocation) - snprintf(buf, 127, "/sys/devices/system/node/node%u", node + 1); - if (access(buf,R_OK) != 0) break; - } - return (node+1); -} -#else -static size_t mi_os_numa_nodex(void) { - return 0; -} -static size_t mi_os_numa_node_countx(void) { - return 1; -} -#endif _Atomic(size_t) _mi_numa_node_count; // = 0 // cache the node count @@ -1212,9 +674,9 @@ size_t _mi_os_numa_node_count_get(void) { count = (size_t)ncount; } else { - count = mi_os_numa_node_countx(); // or detect dynamically + count = _mi_prim_numa_node_count(); // or detect dynamically if (count == 0) count = 1; - } + } mi_atomic_store_release(&_mi_numa_node_count, count); // save it _mi_verbose_message("using %zd numa regions\n", count); } @@ -1222,11 +684,11 @@ size_t _mi_os_numa_node_count_get(void) { } int _mi_os_numa_node_get(mi_os_tld_t* tld) { - UNUSED(tld); + MI_UNUSED(tld); size_t numa_count = _mi_os_numa_node_count(); if (numa_count<=1) return 0; // optimize on single numa node systems: always node 0 // never more than the node count and >= 0 - size_t numa_node = mi_os_numa_nodex(); + size_t numa_node = _mi_prim_numa_node(); if (numa_node >= numa_count) { numa_node = numa_node % numa_count; } return (int)numa_node; } diff --git a/contrib/libs/mimalloc/src/page-queue.c b/contrib/libs/mimalloc/src/page-queue.c index 365257e7669c..02a8008d4a1a 100644 --- a/contrib/libs/mimalloc/src/page-queue.c +++ b/contrib/libs/mimalloc/src/page-queue.c @@ -1,5 +1,5 @@ /*---------------------------------------------------------------------------- -Copyright (c) 2018-2020, Microsoft Research, Daan Leijen +Copyright (c) 2018-2024, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -11,6 +11,10 @@ terms of the MIT license. A copy of the license can be found in the file #ifndef MI_IN_PAGE_C #error "this file should be included from 'page.c'" +// include to help an IDE +#include "mimalloc.h" +#include "mimalloc/internal.h" +#include "mimalloc/atomic.h" #endif /* ----------------------------------------------------------- @@ -53,7 +57,7 @@ static inline bool mi_page_queue_is_special(const mi_page_queue_t* pq) { // Returns MI_BIN_HUGE if the size is too large. // We use `wsize` for the size in "machine word sizes", // i.e. byte size == `wsize*sizeof(void*)`. 
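
Since the binning code below works in "machine word sizes" rather than bytes, it helps to keep the conversion in mind: a byte size is rounded up to a whole number of pointer-sized words before it is mapped to a bin, which is why requests that differ only within one word end up in the same bin. A tiny sketch of just that conversion, assuming 8-byte words (`wsize_from_size` is an illustrative name for the word-size conversion done by `_mi_wsize_from_size` in the source):

    #include <stddef.h>
    #include <stdio.h>

    // Convert a byte size to a size in machine words, rounding up.
    static size_t wsize_from_size(size_t size) {
      return (size + sizeof(void*) - 1) / sizeof(void*);
    }

    int main(void) {
      // With 8-byte words: 1..8 bytes -> 1 word, 9..16 bytes -> 2 words, ...
      printf("%zu %zu %zu\n",
             wsize_from_size(1), wsize_from_size(8), wsize_from_size(24));  // 1 1 3
      return 0;
    }
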
-extern inline uint8_t _mi_bin(size_t size) { +static inline uint8_t mi_bin(size_t size) { size_t wsize = _mi_wsize_from_size(size); uint8_t bin; if (wsize <= 1) { @@ -76,7 +80,7 @@ extern inline uint8_t _mi_bin(size_t size) { bin = MI_BIN_HUGE; } else { - #if defined(MI_ALIGN4W) + #if defined(MI_ALIGN4W) if (wsize <= 16) { wsize = (wsize+3)&~3; } // round to 4x word sizes #endif wsize--; @@ -98,6 +102,10 @@ extern inline uint8_t _mi_bin(size_t size) { Queue of pages with free blocks ----------------------------------------------------------- */ +uint8_t _mi_bin(size_t size) { + return mi_bin(size); +} + size_t _mi_bin_size(uint8_t bin) { return _mi_heap_empty.pages[bin].block_size; } @@ -105,10 +113,10 @@ size_t _mi_bin_size(uint8_t bin) { // Good size for allocation size_t mi_good_size(size_t size) mi_attr_noexcept { if (size <= MI_LARGE_OBJ_SIZE_MAX) { - return _mi_bin_size(_mi_bin(size)); + return _mi_bin_size(mi_bin(size + MI_PADDING_SIZE)); } else { - return _mi_align_up(size,_mi_os_page_size()); + return _mi_align_up(size + MI_PADDING_SIZE,_mi_os_page_size()); } } @@ -133,21 +141,21 @@ static bool mi_heap_contains_queue(const mi_heap_t* heap, const mi_page_queue_t* } #endif -static mi_page_queue_t* mi_page_queue_of(const mi_page_t* page) { - uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : _mi_bin(page->xblock_size)); - mi_heap_t* heap = mi_page_heap(page); - mi_assert_internal(heap != NULL && bin <= MI_BIN_FULL); +static mi_page_queue_t* mi_heap_page_queue_of(mi_heap_t* heap, const mi_page_t* page) { + mi_assert_internal(heap!=NULL); + uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : (mi_page_is_huge(page) ? MI_BIN_HUGE : mi_bin(mi_page_block_size(page)))); + mi_assert_internal(bin <= MI_BIN_FULL); mi_page_queue_t* pq = &heap->pages[bin]; - mi_assert_internal(bin >= MI_BIN_HUGE || page->xblock_size == pq->block_size); - mi_assert_expensive(mi_page_queue_contains(pq, page)); + mi_assert_internal((mi_page_block_size(page) == pq->block_size) || + (mi_page_is_huge(page) && mi_page_queue_is_huge(pq)) || + (mi_page_is_in_full(page) && mi_page_queue_is_full(pq))); return pq; } -static mi_page_queue_t* mi_heap_page_queue_of(mi_heap_t* heap, const mi_page_t* page) { - uint8_t bin = (mi_page_is_in_full(page) ? 
MI_BIN_FULL : _mi_bin(page->xblock_size)); - mi_assert_internal(bin <= MI_BIN_FULL); - mi_page_queue_t* pq = &heap->pages[bin]; - mi_assert_internal(mi_page_is_in_full(page) || page->xblock_size == pq->block_size); +static mi_page_queue_t* mi_page_queue_of(const mi_page_t* page) { + mi_heap_t* heap = mi_page_heap(page); + mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page); + mi_assert_expensive(mi_page_queue_contains(pq, page)); return pq; } @@ -177,9 +185,9 @@ static inline void mi_heap_queue_first_update(mi_heap_t* heap, const mi_page_que } else { // find previous size; due to minimal alignment upto 3 previous bins may need to be skipped - uint8_t bin = _mi_bin(size); + uint8_t bin = mi_bin(size); const mi_page_queue_t* prev = pq - 1; - while( bin == _mi_bin(prev->block_size) && prev > &heap->pages[0]) { + while( bin == mi_bin(prev->block_size) && prev > &heap->pages[0]) { prev--; } start = 1 + _mi_wsize_from_size(prev->block_size); @@ -202,7 +210,9 @@ static bool mi_page_queue_is_empty(mi_page_queue_t* queue) { static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) { mi_assert_internal(page != NULL); mi_assert_expensive(mi_page_queue_contains(queue, page)); - mi_assert_internal(page->xblock_size == queue->block_size || (page->xblock_size > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_huge(queue)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); + mi_assert_internal(mi_page_block_size(page) == queue->block_size || + (mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) || + (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); mi_heap_t* heap = mi_page_heap(page); if (page->prev != NULL) page->prev->next = page->next; if (page->next != NULL) page->next->prev = page->prev; @@ -224,9 +234,11 @@ static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) { static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_t* page) { mi_assert_internal(mi_page_heap(page) == heap); mi_assert_internal(!mi_page_queue_contains(queue, page)); + #if MI_HUGE_PAGE_ABANDON mi_assert_internal(_mi_page_segment(page)->page_kind != MI_PAGE_HUGE); - mi_assert_internal(page->xblock_size == queue->block_size || - (page->xblock_size > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_huge(queue)) || + #endif + mi_assert_internal(mi_page_block_size(page) == queue->block_size || + (mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); mi_page_set_in_full(page, mi_page_queue_is_full(queue)); @@ -252,11 +264,13 @@ static void mi_page_queue_enqueue_from(mi_page_queue_t* to, mi_page_queue_t* fro mi_assert_internal(page != NULL); mi_assert_expensive(mi_page_queue_contains(from, page)); mi_assert_expensive(!mi_page_queue_contains(to, page)); - mi_assert_internal((page->xblock_size == to->block_size && page->xblock_size == from->block_size) || - (page->xblock_size == to->block_size && mi_page_queue_is_full(from)) || - (page->xblock_size == from->block_size && mi_page_queue_is_full(to)) || - (page->xblock_size > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_huge(to)) || - (page->xblock_size > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_full(to))); + const size_t bsize = mi_page_block_size(page); + MI_UNUSED(bsize); + mi_assert_internal((bsize == to->block_size && bsize == from->block_size) || + (bsize == to->block_size && mi_page_queue_is_full(from)) || + (bsize == from->block_size && mi_page_queue_is_full(to)) || + (mi_page_is_huge(page) && mi_page_queue_is_huge(to)) || + (mi_page_is_huge(page) 
&& mi_page_queue_is_full(to))); mi_heap_t* heap = mi_page_heap(page); if (page->prev != NULL) page->prev->next = page->next; @@ -297,7 +311,7 @@ size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue for (mi_page_t* page = append->first; page != NULL; page = page->next) { // inline `mi_page_set_heap` to avoid wrong assertion during absorption; // in this case it is ok to be delayed freeing since both "to" and "from" heap are still alive. - mi_atomic_store_release(&page->xheap, (uintptr_t)heap); + mi_atomic_store_release(&page->xheap, (uintptr_t)heap); // set the flag to delayed free (not overriding NEVER_DELAYED_FREE) which has as a // side effect that it spins until any DELAYED_FREEING is finished. This ensures // that after appending only the new heap will be used for delayed free operations. diff --git a/contrib/libs/mimalloc/src/page.c b/contrib/libs/mimalloc/src/page.c index c08be9c00b7d..5a18b78027c4 100644 --- a/contrib/libs/mimalloc/src/page.c +++ b/contrib/libs/mimalloc/src/page.c @@ -1,5 +1,5 @@ /*---------------------------------------------------------------------------- -Copyright (c) 2018-2020, Microsoft Research, Daan Leijen +Copyright (c) 2018-2024, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -7,13 +7,13 @@ terms of the MIT license. A copy of the license can be found in the file /* ----------------------------------------------------------- The core of the allocator. Every segment contains - pages of a {certain block size. The main function + pages of a certain block size. The main function exported is `mi_malloc_generic`. ----------------------------------------------------------- */ #include "mimalloc.h" -#include "mimalloc-internal.h" -#include "mimalloc-atomic.h" +#include "mimalloc/internal.h" +#include "mimalloc/atomic.h" /* ----------------------------------------------------------- Definition of page queues for each block size @@ -30,7 +30,7 @@ terms of the MIT license. 
A copy of the license can be found in the file // Index a block in a page static inline mi_block_t* mi_page_block_at(const mi_page_t* page, void* page_start, size_t block_size, size_t i) { - UNUSED(page); + MI_UNUSED(page); mi_assert_internal(page != NULL); mi_assert_internal(i <= page->reserved); return (mi_block_t*)((uint8_t*)page_start + (i * block_size)); @@ -59,42 +59,54 @@ static inline uint8_t* mi_page_area(const mi_page_t* page) { static bool mi_page_list_is_valid(mi_page_t* page, mi_block_t* p) { size_t psize; - uint8_t* page_area = _mi_page_start(_mi_page_segment(page), page, &psize); + uint8_t* page_area = _mi_segment_page_start(_mi_page_segment(page), page, &psize); mi_block_t* start = (mi_block_t*)page_area; mi_block_t* end = (mi_block_t*)(page_area + psize); while(p != NULL) { if (p < start || p >= end) return false; p = mi_block_next(page, p); } +#if MI_DEBUG>3 // generally too expensive to check this + if (page->free_is_zero) { + const size_t ubsize = mi_page_usable_block_size(page); + for (mi_block_t* block = page->free; block != NULL; block = mi_block_next(page, block)) { + mi_assert_expensive(mi_mem_is_zero(block + 1, ubsize - sizeof(mi_block_t))); + } + } +#endif return true; } static bool mi_page_is_valid_init(mi_page_t* page) { - mi_assert_internal(page->xblock_size > 0); + mi_assert_internal(mi_page_block_size(page) > 0); mi_assert_internal(page->used <= page->capacity); mi_assert_internal(page->capacity <= page->reserved); - const size_t bsize = mi_page_block_size(page); + // const size_t bsize = mi_page_block_size(page); mi_segment_t* segment = _mi_page_segment(page); - uint8_t* start = _mi_page_start(segment,page,NULL); - mi_assert_internal(start == _mi_segment_page_start(segment,page,bsize,NULL,NULL)); + uint8_t* start = mi_page_start(page); + mi_assert_internal(start == _mi_segment_page_start(segment,page,NULL)); + mi_assert_internal(page->is_huge == (segment->page_kind == MI_PAGE_HUGE)); //mi_assert_internal(start + page->capacity*page->block_size == page->top); mi_assert_internal(mi_page_list_is_valid(page,page->free)); mi_assert_internal(mi_page_list_is_valid(page,page->local_free)); #if MI_DEBUG>3 // generally too expensive to check this - if (page->flags.is_zero) { - for(mi_block_t* block = page->free; block != NULL; mi_block_next(page,block)) { - mi_assert_expensive(mi_mem_is_zero(block + 1, page->block_size - sizeof(mi_block_t))); + if (page->free_is_zero) { + const size_t ubsize = mi_page_usable_block_size(page); + for(mi_block_t* block = page->free; block != NULL; block = mi_block_next(page,block)) { + mi_assert_expensive(mi_mem_is_zero(block + 1, ubsize - sizeof(mi_block_t))); } } #endif + #if !MI_TRACK_ENABLED && !MI_TSAN mi_block_t* tfree = mi_page_thread_free(page); mi_assert_internal(mi_page_list_is_valid(page, tfree)); //size_t tfree_count = mi_page_list_count(page, tfree); //mi_assert_internal(tfree_count <= page->thread_freed + 1); + #endif size_t free_count = mi_page_list_count(page, page->free) + mi_page_list_count(page, page->local_free); mi_assert_internal(page->used + free_count == page->capacity); @@ -102,6 +114,8 @@ static bool mi_page_is_valid_init(mi_page_t* page) { return true; } +extern bool _mi_process_is_initialized; // has mi_process_init been called? 
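As a stand-alone illustration of the bounds check that `mi_page_list_is_valid` performs in the hunk above: walk the page's intrusive free list and reject any block that falls outside the page area. This sketch assumes a plain, unencoded `next` pointer, whereas mimalloc traverses the list via `mi_block_next`, which may decode an encoded pointer.

  #include <stdbool.h>
  #include <stddef.h>
  #include <stdint.h>

  typedef struct block_s { struct block_s* next; } block_t;

  /* true when every block on the free list lies inside [start, start+size) */
  static bool list_in_page_area(const block_t* head, const uint8_t* start, size_t size) {
    const uint8_t* end = start + size;
    for (const block_t* b = head; b != NULL; b = b->next) {
      const uint8_t* p = (const uint8_t*)b;
      if (p < start || p >= end) return false;   /* block escaped the page area */
    }
    return true;
  }
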
+ bool _mi_page_is_valid(mi_page_t* page) { mi_assert_internal(mi_page_is_valid_init(page)); #if MI_SECURE @@ -110,7 +124,10 @@ bool _mi_page_is_valid(mi_page_t* page) { if (mi_page_heap(page)!=NULL) { mi_segment_t* segment = _mi_page_segment(page); mi_assert_internal(!_mi_process_is_initialized || segment->thread_id == mi_page_heap(page)->thread_id || segment->thread_id==0); - if (segment->page_kind != MI_PAGE_HUGE) { + #if MI_HUGE_PAGE_ABANDON + if (segment->page_kind != MI_PAGE_HUGE) + #endif + { mi_page_queue_t* pq = mi_page_queue_of(page); mi_assert_internal(mi_page_queue_contains(pq, page)); mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_block_size(page) > MI_LARGE_OBJ_SIZE_MAX || mi_page_is_in_full(page)); @@ -122,14 +139,23 @@ bool _mi_page_is_valid(mi_page_t* page) { #endif void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never) { + while (!_mi_page_try_use_delayed_free(page, delay, override_never)) { + mi_atomic_yield(); + } +} + +bool _mi_page_try_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never) { mi_thread_free_t tfreex; mi_delayed_t old_delay; - mi_thread_free_t tfree; + mi_thread_free_t tfree; + size_t yield_count = 0; do { tfree = mi_atomic_load_acquire(&page->xthread_free); // note: must acquire as we can break/repeat this loop and not do a CAS; tfreex = mi_tf_set_delayed(tfree, delay); old_delay = mi_tf_delayed(tfree); - if (mi_unlikely(old_delay == MI_DELAYED_FREEING)) { + if mi_unlikely(old_delay == MI_DELAYED_FREEING) { + if (yield_count >= 4) return false; // give up after 4 tries + yield_count++; mi_atomic_yield(); // delay until outstanding MI_DELAYED_FREEING are done. // tfree = mi_tf_set_delayed(tfree, MI_NO_DELAYED_FREE); // will cause CAS to busy fail } @@ -141,6 +167,8 @@ void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool overrid } } while ((old_delay == MI_DELAYED_FREEING) || !mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); + + return true; // success } /* ----------------------------------------------------------- @@ -165,8 +193,8 @@ static void _mi_page_thread_free_collect(mi_page_t* page) if (head == NULL) return; // find the tail -- also to get a proper count (without data races) - uint32_t max_count = page->capacity; // cannot collect more than capacity - uint32_t count = 1; + size_t max_count = page->capacity; // cannot collect more than capacity + size_t count = 1; mi_block_t* tail = head; mi_block_t* next; while ((next = mi_block_next(page,tail)) != NULL && count <= max_count) { @@ -184,7 +212,7 @@ static void _mi_page_thread_free_collect(mi_page_t* page) page->local_free = head; // update counts now - page->used -= count; + page->used -= (uint16_t)count; } void _mi_page_free_collect(mi_page_t* page, bool force) { @@ -197,11 +225,11 @@ void _mi_page_free_collect(mi_page_t* page, bool force) { // and the local free list if (page->local_free != NULL) { - if (mi_likely(page->free == NULL)) { + if mi_likely(page->free == NULL) { // usual case page->free = page->local_free; page->local_free = NULL; - page->is_zero = false; + page->free_is_zero = false; } else if (force) { // append -- only on shutdown (force) as this is a linear operation @@ -213,7 +241,7 @@ void _mi_page_free_collect(mi_page_t* page, bool force) { mi_block_set_next(page, tail, page->free); page->free = page->local_free; page->local_free = NULL; - page->is_zero = false; + page->free_is_zero = false; } } @@ -231,8 +259,10 @@ void _mi_page_reclaim(mi_heap_t* heap, 
mi_page_t* page) { mi_assert_expensive(mi_page_is_valid_init(page)); mi_assert_internal(mi_page_heap(page) == heap); mi_assert_internal(mi_page_thread_free_flag(page) != MI_NEVER_DELAYED_FREE); + #if MI_HUGE_PAGE_ABANDON mi_assert_internal(_mi_page_segment(page)->page_kind != MI_PAGE_HUGE); - mi_assert_internal(!page->is_reset); + #endif + // TODO: push on full queue immediately if it is full? mi_page_queue_t* pq = mi_page_queue(heap, mi_page_block_size(page)); mi_page_queue_push(heap, pq, page); @@ -240,19 +270,27 @@ void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) { } // allocate a fresh page from a segment -static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size_t block_size) { - mi_assert_internal(pq==NULL||mi_heap_contains_queue(heap, pq)); - mi_assert_internal(pq==NULL||block_size == pq->block_size); - mi_page_t* page = _mi_segment_page_alloc(heap, block_size, &heap->tld->segments, &heap->tld->os); +static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size_t block_size, size_t page_alignment) { + #if !MI_HUGE_PAGE_ABANDON + mi_assert_internal(pq != NULL); + mi_assert_internal(mi_heap_contains_queue(heap, pq)); + mi_assert_internal(page_alignment > 0 || block_size > MI_LARGE_OBJ_SIZE_MAX || block_size == pq->block_size); + #endif + mi_page_t* page = _mi_segment_page_alloc(heap, block_size, page_alignment, &heap->tld->segments, &heap->tld->os); if (page == NULL) { // this may be out-of-memory, or an abandoned page was reclaimed (and in our queue) return NULL; } - // a fresh page was found, initialize it + #if MI_HUGE_PAGE_ABANDON mi_assert_internal(pq==NULL || _mi_page_segment(page)->page_kind != MI_PAGE_HUGE); - mi_page_init(heap, page, block_size, heap->tld); - _mi_stat_increase(&heap->tld->stats.pages, 1); - if (pq!=NULL) mi_page_queue_push(heap, pq, page); // huge pages use pq==NULL + #endif + mi_assert_internal(pq!=NULL || mi_page_block_size(page) >= block_size); + // a fresh page was found, initialize it + const size_t full_block_size = (pq == NULL || mi_page_is_huge(page) ? 
mi_page_block_size(page) : block_size); // see also: mi_segment_huge_page_alloc + mi_assert_internal(full_block_size >= block_size); + mi_page_init(heap, page, full_block_size, heap->tld); + mi_heap_stat_increase(heap, pages, 1); + if (pq != NULL) { mi_page_queue_push(heap, pq, page); } mi_assert_expensive(_mi_page_is_valid(page)); return page; } @@ -260,7 +298,7 @@ static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size // Get a fresh page to use static mi_page_t* mi_page_fresh(mi_heap_t* heap, mi_page_queue_t* pq) { mi_assert_internal(mi_heap_contains_queue(heap, pq)); - mi_page_t* page = mi_page_fresh_alloc(heap, pq, pq->block_size); + mi_page_t* page = mi_page_fresh_alloc(heap, pq, pq->block_size, 0); if (page==NULL) return NULL; mi_assert_internal(pq->block_size==mi_page_block_size(page)); mi_assert_internal(pq==mi_page_queue(heap, mi_page_block_size(page))); @@ -271,10 +309,18 @@ static mi_page_t* mi_page_fresh(mi_heap_t* heap, mi_page_queue_t* pq) { Do any delayed frees (put there by other threads if they deallocated in a full page) ----------------------------------------------------------- */ -void _mi_heap_delayed_free(mi_heap_t* heap) { +void _mi_heap_delayed_free_all(mi_heap_t* heap) { + while (!_mi_heap_delayed_free_partial(heap)) { + mi_atomic_yield(); + } +} + +// returns true if all delayed frees were processed +bool _mi_heap_delayed_free_partial(mi_heap_t* heap) { // take over the list (note: no atomic exchange since it is often NULL) mi_block_t* block = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); while (block != NULL && !mi_atomic_cas_ptr_weak_acq_rel(mi_block_t, &heap->thread_delayed_free, &block, NULL)) { /* nothing */ }; + bool all_freed = true; // and free them all while(block != NULL) { @@ -282,7 +328,9 @@ void _mi_heap_delayed_free(mi_heap_t* heap) { // use internal free instead of regular one to keep stats etc correct if (!_mi_free_delayed_block(block)) { // we might already start delayed freeing while another thread has not yet - // reset the delayed_freeing flag; in that case delay it further by reinserting. + // reset the delayed_freeing flag; in that case delay it further by reinserting the current block + // into the delayed free list + all_freed = false; mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); do { mi_block_set_nextx(heap, block, dfree, heap->keys); @@ -290,6 +338,7 @@ void _mi_heap_delayed_free(mi_heap_t* heap) { } block = next; } + return all_freed; } /* ----------------------------------------------------------- @@ -342,7 +391,7 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) { mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE); mi_page_set_heap(page, NULL); -#if MI_DEBUG>1 +#if (MI_DEBUG>1) && !MI_TRACK_ENABLED // check there are no references left.. 
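The `_mi_heap_delayed_free_partial` hunk above takes ownership of the shared `thread_delayed_free` list by CAS-ing the head to NULL and then walking the detached chain privately; blocks that cannot be freed yet are re-inserted, which is why the function reports whether everything was freed. A minimal sketch of that detach-the-whole-list pattern, using C11 atomics and illustrative names rather than mimalloc's own:

  #include <stdatomic.h>
  #include <stddef.h>

  typedef struct node_s { struct node_s* next; } node_t;

  /* detach the entire list in one step; the caller then owns it exclusively */
  static node_t* take_all(_Atomic(node_t*)* head) {
    node_t* list = atomic_load_explicit(head, memory_order_relaxed);
    while (list != NULL &&
           !atomic_compare_exchange_weak_explicit(head, &list, NULL,
                                                  memory_order_acq_rel,
                                                  memory_order_relaxed)) {
      /* a failed CAS reloads `list`; retry until it is detached or found empty */
    }
    return list;
  }
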
for (mi_block_t* block = (mi_block_t*)pheap->thread_delayed_free; block != NULL; block = mi_block_nextx(pheap, block, pheap->keys)) { mi_assert_internal(_mi_ptr_page(block) != page); @@ -376,8 +425,8 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) { _mi_segment_page_free(page, force, segments_tld); } -#define MI_MAX_RETIRE_SIZE MI_LARGE_OBJ_SIZE_MAX -#define MI_RETIRE_CYCLES (8) +#define MI_MAX_RETIRE_SIZE MI_LARGE_OBJ_SIZE_MAX // should be less than size for MI_BIN_HUGE +#define MI_RETIRE_CYCLES (16) // Retire a page with no more used blocks // Important to not retire too quickly though as new @@ -385,7 +434,7 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) { // Note: called from `mi_free` and benchmarks often // trigger this due to freeing everything and then // allocating again so careful when changing this. -void _mi_page_retire(mi_page_t* page) { +void _mi_page_retire(mi_page_t* page) mi_attr_noexcept { mi_assert_internal(page != NULL); mi_assert_expensive(_mi_page_is_valid(page)); mi_assert_internal(mi_page_all_free(page)); @@ -399,10 +448,11 @@ void _mi_page_retire(mi_page_t* page) { // how to check this efficiently though... // for now, we don't retire if it is the only page left of this size class. mi_page_queue_t* pq = mi_page_queue_of(page); - if (mi_likely(page->xblock_size <= MI_MAX_RETIRE_SIZE && !mi_page_is_in_full(page))) { + const size_t bsize = mi_page_block_size(page); + if mi_likely( /* bsize < MI_MAX_RETIRE_SIZE && */ !mi_page_queue_is_special(pq)) { // not full or huge queue? if (pq->last==page && pq->first==page) { // the only page in the queue? mi_stat_counter_increase(_mi_stats_main.page_no_retire,1); - page->retire_expire = (page->xblock_size <= MI_SMALL_OBJ_SIZE_MAX ? MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4); + page->retire_expire = (bsize <= MI_SMALL_OBJ_SIZE_MAX ? 
MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4); mi_heap_t* heap = mi_page_heap(page); mi_assert_internal(pq >= heap->pages); const size_t index = pq - heap->pages; @@ -410,7 +460,7 @@ void _mi_page_retire(mi_page_t* page) { if (index < heap->page_retired_min) heap->page_retired_min = index; if (index > heap->page_retired_max) heap->page_retired_max = index; mi_assert_internal(mi_page_all_free(page)); - return; // dont't free after all + return; // don't free after all } } @@ -458,14 +508,14 @@ void _mi_heap_collect_retired(mi_heap_t* heap, bool force) { #define MI_MIN_SLICES (2) static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* const page, const size_t bsize, const size_t extend, mi_stats_t* const stats) { - UNUSED(stats); + MI_UNUSED(stats); #if (MI_SECURE<=2) mi_assert_internal(page->free == NULL); mi_assert_internal(page->local_free == NULL); #endif mi_assert_internal(page->capacity + extend <= page->reserved); mi_assert_internal(bsize == mi_page_block_size(page)); - void* const page_area = _mi_page_start(_mi_page_segment(page), page, NULL); + void* const page_area = mi_page_start(page); // initialize a randomized free list // set up `slice_count` slices to alternate between @@ -516,14 +566,14 @@ static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* co static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, const size_t bsize, const size_t extend, mi_stats_t* const stats) { - UNUSED(stats); + MI_UNUSED(stats); #if (MI_SECURE <= 2) mi_assert_internal(page->free == NULL); mi_assert_internal(page->local_free == NULL); #endif mi_assert_internal(page->capacity + extend <= page->reserved); mi_assert_internal(bsize == mi_page_block_size(page)); - void* const page_area = _mi_page_start(_mi_page_segment(page), page, NULL ); + void* const page_area = mi_page_start(page); mi_block_t* const start = mi_page_block_at(page, page_area, bsize, page->capacity); @@ -566,20 +616,23 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) if (page->capacity >= page->reserved) return; size_t page_size; - //uint8_t* page_start = - _mi_page_start(_mi_page_segment(page), page, &page_size); + //uint8_t* page_start = + _mi_segment_page_start(_mi_page_segment(page), page, &page_size); mi_stat_counter_increase(tld->stats.pages_extended, 1); // calculate the extend count - const size_t bsize = (page->xblock_size < MI_HUGE_BLOCK_SIZE ? page->xblock_size : page_size); + const size_t bsize = mi_page_block_size(page); size_t extend = page->reserved - page->capacity; - size_t max_extend = (bsize >= MI_MAX_EXTEND_SIZE ? MI_MIN_EXTEND : MI_MAX_EXTEND_SIZE/(uint32_t)bsize); - if (max_extend < MI_MIN_EXTEND) max_extend = MI_MIN_EXTEND; + mi_assert_internal(extend > 0); + + size_t max_extend = (bsize >= MI_MAX_EXTEND_SIZE ? MI_MIN_EXTEND : MI_MAX_EXTEND_SIZE/bsize); + if (max_extend < MI_MIN_EXTEND) { max_extend = MI_MIN_EXTEND; } + mi_assert_internal(max_extend > 0); if (extend > max_extend) { // ensure we don't touch memory beyond the page to reduce page commit. // the `lean` benchmark tests this. Going from 1 to 8 increases rss by 50%. - extend = (max_extend==0 ? 
1 : max_extend); + extend = max_extend; } mi_assert_internal(extend > 0 && extend + page->capacity <= page->reserved); @@ -595,11 +648,6 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) // enable the new free list page->capacity += (uint16_t)extend; mi_stat_increase(tld->stats.page_committed, extend * bsize); - - // extension into zero initialized memory preserves the zero'd free list - if (!page->is_zero_init) { - page->is_zero = false; - } mi_assert_expensive(mi_page_is_valid_init(page)); } @@ -611,16 +659,30 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi mi_assert_internal(block_size > 0); // set fields mi_page_set_heap(page, heap); + page->block_size = block_size; size_t page_size; - _mi_segment_page_start(segment, page, block_size, &page_size, NULL); - page->xblock_size = (block_size < MI_HUGE_BLOCK_SIZE ? (uint32_t)block_size : MI_HUGE_BLOCK_SIZE); + page->page_start = _mi_segment_page_start(segment, page, &page_size); + mi_track_mem_noaccess(page->page_start,page_size); mi_assert_internal(page_size / block_size < (1L<<16)); page->reserved = (uint16_t)(page_size / block_size); - #ifdef MI_ENCODE_FREELIST + mi_assert_internal(page->reserved > 0); + #if (MI_PADDING || MI_ENCODE_FREELIST) page->keys[0] = _mi_heap_random_next(heap); page->keys[1] = _mi_heap_random_next(heap); #endif - page->is_zero = page->is_zero_init; + page->free_is_zero = page->is_zero_init; + #if MI_DEBUG>2 + if (page->is_zero_init) { + mi_track_mem_defined(page->page_start, page_size); + mi_assert_expensive(mi_mem_is_zero(page->page_start, page_size)); + } + #endif + if (block_size > 0 && _mi_is_power_of_two(block_size)) { + page->block_size_shift = (uint8_t)(mi_ctz((uintptr_t)block_size)); + } + else { + page->block_size_shift = 0; + } mi_assert_internal(page->capacity == 0); mi_assert_internal(page->free == NULL); @@ -630,10 +692,11 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi mi_assert_internal(page->prev == NULL); mi_assert_internal(page->retire_expire == 0); mi_assert_internal(!mi_page_has_aligned(page)); - #if (MI_ENCODE_FREELIST) + #if (MI_PADDING || MI_ENCODE_FREELIST) mi_assert_internal(page->keys[0] != 0); mi_assert_internal(page->keys[1] != 0); #endif + mi_assert_internal(page->block_size_shift == 0 || (block_size == ((size_t)1 << page->block_size_shift))); mi_assert_expensive(mi_page_is_valid_init(page)); // initialize an initial free list @@ -650,12 +713,16 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* pq, bool first_try) { // search through the pages in "next fit" order + #if MI_STAT size_t count = 0; + #endif mi_page_t* page = pq->first; while (page != NULL) { mi_page_t* next = page->next; // remember next + #if MI_STAT count++; + #endif // 0. 
collect freed blocks by us and other threads _mi_page_free_collect(page, false); @@ -680,14 +747,14 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p page = next; } // for each page - mi_stat_counter_increase(heap->tld->stats.searches, count); + mi_heap_stat_counter_increase(heap, searches, count); if (page == NULL) { _mi_heap_collect_retired(heap, false); // perhaps make a page available page = mi_page_fresh(heap, pq); if (page == NULL && first_try) { // out-of-memory _or_ an abandoned page with free blocks was reclaimed, try once again - page = mi_page_queue_find_free_ex(heap, pq, false); + page = mi_page_queue_find_free_ex(heap, pq, false); } } else { @@ -705,17 +772,17 @@ static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, size_t size) { mi_page_queue_t* pq = mi_page_queue(heap,size); mi_page_t* page = pq->first; if (page != NULL) { - #if (MI_SECURE>=3) // in secure mode, we extend half the time to increase randomness + #if (MI_SECURE>=3) // in secure mode, we extend half the time to increase randomness if (page->capacity < page->reserved && ((_mi_heap_random_next(heap) & 1) == 1)) { mi_page_extend_free(heap, page, heap->tld); mi_assert_internal(mi_page_immediate_available(page)); } - else + else #endif { _mi_page_free_collect(page,false); } - + if (mi_page_immediate_available(page)) { page->retire_expire = 0; return page; // fast path @@ -754,31 +821,31 @@ void mi_register_deferred_free(mi_deferred_free_fun* fn, void* arg) mi_attr_noex General allocation ----------------------------------------------------------- */ -// A huge page is allocated directly without being in a queue. -// Because huge pages contain just one block, and the segment contains -// just that page, we always treat them as abandoned and any thread -// that frees the block can free the whole page and segment directly. -static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size) { +// Huge pages contain just one block, and the segment contains just that page. +// Huge pages are also use if the requested alignment is very large (> MI_BLOCK_ALIGNMENT_MAX) +// so their size is not always `> MI_LARGE_OBJ_SIZE_MAX`. 
+static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_alignment) { size_t block_size = _mi_os_good_alloc_size(size); - mi_assert_internal(_mi_bin(block_size) == MI_BIN_HUGE); - mi_page_t* page = mi_page_fresh_alloc(heap,NULL,block_size); + mi_assert_internal(mi_bin(block_size) == MI_BIN_HUGE || page_alignment > 0); + #if MI_HUGE_PAGE_ABANDON + mi_page_queue_t* pq = NULL; + #else + mi_page_queue_t* pq = mi_page_queue(heap, MI_LARGE_OBJ_SIZE_MAX+1); // always in the huge queue regardless of the block size + mi_assert_internal(mi_page_queue_is_huge(pq)); + #endif + mi_page_t* page = mi_page_fresh_alloc(heap, pq, block_size, page_alignment); if (page != NULL) { - const size_t bsize = mi_page_block_size(page); // note: not `mi_page_usable_block_size` as `size` includes padding already - mi_assert_internal(bsize >= size); + mi_assert_internal(mi_page_block_size(page) >= size); mi_assert_internal(mi_page_immediate_available(page)); - mi_assert_internal(_mi_page_segment(page)->page_kind==MI_PAGE_HUGE); + mi_assert_internal(mi_page_is_huge(page)); + mi_assert_internal(_mi_page_segment(page)->page_kind == MI_PAGE_HUGE); mi_assert_internal(_mi_page_segment(page)->used==1); + #if MI_HUGE_PAGE_ABANDON mi_assert_internal(_mi_page_segment(page)->thread_id==0); // abandoned, not in the huge queue mi_page_set_heap(page, NULL); - - if (bsize > MI_HUGE_OBJ_SIZE_MAX) { - _mi_stat_increase(&heap->tld->stats.giant, bsize); - _mi_stat_counter_increase(&heap->tld->stats.giant_count, 1); - } - else { - _mi_stat_increase(&heap->tld->stats.huge, bsize); - _mi_stat_counter_increase(&heap->tld->stats.huge_count, 1); - } + #endif + mi_heap_stat_increase(heap, huge, mi_page_block_size(page)); + mi_heap_stat_counter_increase(heap, huge_count, 1); } return page; } @@ -786,54 +853,57 @@ static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size) { // Allocate a page // Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed. -static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size) mi_attr_noexcept { +static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size, size_t huge_alignment) mi_attr_noexcept { // huge allocation? - const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size` - if (mi_unlikely(req_size > (MI_LARGE_OBJ_SIZE_MAX - MI_PADDING_SIZE) )) { - if (mi_unlikely(req_size > PTRDIFF_MAX)) { // we don't allocate more than PTRDIFF_MAX (see ) + const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size` + if mi_unlikely(req_size > (MI_LARGE_OBJ_SIZE_MAX - MI_PADDING_SIZE) || huge_alignment > 0) { + if mi_unlikely(req_size > MI_MAX_ALLOC_SIZE) { _mi_error_message(EOVERFLOW, "allocation request is too large (%zu bytes)\n", req_size); return NULL; } else { - return mi_huge_page_alloc(heap,size); + return mi_huge_page_alloc(heap,size,huge_alignment); } } else { // otherwise find a page with free blocks in our size segregated queues + #if MI_PADDING mi_assert_internal(size >= MI_PADDING_SIZE); + #endif return mi_find_free_page(heap, size); } } // Generic allocation routine if the fast path (`alloc.c:mi_page_malloc`) does not succeed. // Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed. -void* _mi_malloc_generic(mi_heap_t* heap, size_t size) mi_attr_noexcept +// The `huge_alignment` is normally 0 but is set to a multiple of MI_SEGMENT_SIZE for +// very large requested alignments in which case we use a huge segment. 
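At the user level, the huge/over-aligned path described in the comment above is what ultimately serves requests whose alignment exceeds what a normal block can provide. A minimal usage sketch (the 4 MiB alignment is only an illustrative value, and error handling is reduced to a NULL check):

  #include <mimalloc.h>
  #include <stdint.h>
  #include <stdio.h>

  int main(void) {
    const size_t align = 4u * 1024 * 1024;   /* illustrative 4 MiB alignment */
    void* p = mi_malloc_aligned(1024, align);
    if (p != NULL) {
      printf("aligned: %d\n", (int)(((uintptr_t)p % align) == 0));
      mi_free(p);
    }
    return 0;
  }
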
+void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept { mi_assert_internal(heap != NULL); // initialize if necessary - if (mi_unlikely(!mi_heap_is_initialized(heap))) { - mi_thread_init(); // calls `_mi_heap_init` in turn - heap = mi_get_default_heap(); - if (mi_unlikely(!mi_heap_is_initialized(heap))) { return NULL; } + if mi_unlikely(!mi_heap_is_initialized(heap)) { + heap = mi_heap_get_default(); // calls mi_thread_init + if mi_unlikely(!mi_heap_is_initialized(heap)) { return NULL; } } mi_assert_internal(mi_heap_is_initialized(heap)); // call potential deferred free routines _mi_deferred_free(heap, false); - // free delayed frees from other threads - _mi_heap_delayed_free(heap); + // free delayed frees from other threads (but skip contended ones) + _mi_heap_delayed_free_partial(heap); // find (or allocate) a page of the right size - mi_page_t* page = mi_find_page(heap, size); - if (mi_unlikely(page == NULL)) { // first time out of memory, try to collect and retry the allocation once more + mi_page_t* page = mi_find_page(heap, size, huge_alignment); + if mi_unlikely(page == NULL) { // first time out of memory, try to collect and retry the allocation once more mi_heap_collect(heap, true /* force */); - page = mi_find_page(heap, size); + page = mi_find_page(heap, size, huge_alignment); } - if (mi_unlikely(page == NULL)) { // out of memory - const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size` + if mi_unlikely(page == NULL) { // out of memory + const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size` _mi_error_message(ENOMEM, "unable to allocate memory (%zu bytes)\n", req_size); return NULL; } @@ -841,6 +911,15 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_assert_internal(mi_page_immediate_available(page)); mi_assert_internal(mi_page_block_size(page) >= size); - // and try again, this time succeeding! (i.e. this should never recurse) - return _mi_page_malloc(heap, page, size); + // and try again, this time succeeding! (i.e. this should never recurse through _mi_page_malloc) + if mi_unlikely(zero && page->block_size == 0) { + // note: we cannot call _mi_page_malloc with zeroing for huge blocks; we zero it afterwards in that case. + void* p = _mi_page_malloc(heap, page, size); + mi_assert_internal(p != NULL); + _mi_memzero_aligned(p, mi_page_usable_block_size(page)); + return p; + } + else { + return _mi_page_malloc_zero(heap, page, size, zero); + } } diff --git a/contrib/libs/mimalloc/src/prim/osx/alloc-override-zone.c b/contrib/libs/mimalloc/src/prim/osx/alloc-override-zone.c new file mode 100644 index 000000000000..1515b886b20b --- /dev/null +++ b/contrib/libs/mimalloc/src/prim/osx/alloc-override-zone.c @@ -0,0 +1,461 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018-2022, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. 
+-----------------------------------------------------------------------------*/ + +#include "mimalloc.h" +#include "mimalloc/internal.h" + +#if defined(MI_MALLOC_OVERRIDE) + +#if !defined(__APPLE__) +#error "this file should only be included on macOS" +#endif + +/* ------------------------------------------------------ + Override system malloc on macOS + This is done through the malloc zone interface. + It seems to be most robust in combination with interposing + though or otherwise we may get zone errors as there are could + be allocations done by the time we take over the + zone. +------------------------------------------------------ */ + +#include +#include +#include // memset +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(MAC_OS_X_VERSION_10_6) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6) +// only available from OSX 10.6 +extern malloc_zone_t* malloc_default_purgeable_zone(void) __attribute__((weak_import)); +#endif + +/* ------------------------------------------------------ + malloc zone members +------------------------------------------------------ */ + +static size_t zone_size(malloc_zone_t* zone, const void* p) { + MI_UNUSED(zone); + if (!mi_is_in_heap_region(p)){ return 0; } // not our pointer, bail out + return mi_usable_size(p); +} + +static void* zone_malloc(malloc_zone_t* zone, size_t size) { + MI_UNUSED(zone); + return mi_malloc(size); +} + +static void* zone_calloc(malloc_zone_t* zone, size_t count, size_t size) { + MI_UNUSED(zone); + return mi_calloc(count, size); +} + +static void* zone_valloc(malloc_zone_t* zone, size_t size) { + MI_UNUSED(zone); + return mi_malloc_aligned(size, _mi_os_page_size()); +} + +static void zone_free(malloc_zone_t* zone, void* p) { + MI_UNUSED(zone); + mi_cfree(p); +} + +static void* zone_realloc(malloc_zone_t* zone, void* p, size_t newsize) { + MI_UNUSED(zone); + return mi_realloc(p, newsize); +} + +static void* zone_memalign(malloc_zone_t* zone, size_t alignment, size_t size) { + MI_UNUSED(zone); + return mi_malloc_aligned(size,alignment); +} + +static void zone_destroy(malloc_zone_t* zone) { + MI_UNUSED(zone); + // todo: ignore for now? 
+} + +static unsigned zone_batch_malloc(malloc_zone_t* zone, size_t size, void** ps, unsigned count) { + size_t i; + for (i = 0; i < count; i++) { + ps[i] = zone_malloc(zone, size); + if (ps[i] == NULL) break; + } + return i; +} + +static void zone_batch_free(malloc_zone_t* zone, void** ps, unsigned count) { + for(size_t i = 0; i < count; i++) { + zone_free(zone, ps[i]); + ps[i] = NULL; + } +} + +static size_t zone_pressure_relief(malloc_zone_t* zone, size_t size) { + MI_UNUSED(zone); MI_UNUSED(size); + mi_collect(false); + return 0; +} + +static void zone_free_definite_size(malloc_zone_t* zone, void* p, size_t size) { + MI_UNUSED(size); + zone_free(zone,p); +} + +static boolean_t zone_claimed_address(malloc_zone_t* zone, void* p) { + MI_UNUSED(zone); + return mi_is_in_heap_region(p); +} + + +/* ------------------------------------------------------ + Introspection members +------------------------------------------------------ */ + +static kern_return_t intro_enumerator(task_t task, void* p, + unsigned type_mask, vm_address_t zone_address, + memory_reader_t reader, + vm_range_recorder_t recorder) +{ + // todo: enumerate all memory + MI_UNUSED(task); MI_UNUSED(p); MI_UNUSED(type_mask); MI_UNUSED(zone_address); + MI_UNUSED(reader); MI_UNUSED(recorder); + return KERN_SUCCESS; +} + +static size_t intro_good_size(malloc_zone_t* zone, size_t size) { + MI_UNUSED(zone); + return mi_good_size(size); +} + +static boolean_t intro_check(malloc_zone_t* zone) { + MI_UNUSED(zone); + return true; +} + +static void intro_print(malloc_zone_t* zone, boolean_t verbose) { + MI_UNUSED(zone); MI_UNUSED(verbose); + mi_stats_print(NULL); +} + +static void intro_log(malloc_zone_t* zone, void* p) { + MI_UNUSED(zone); MI_UNUSED(p); + // todo? +} + +static void intro_force_lock(malloc_zone_t* zone) { + MI_UNUSED(zone); + // todo? +} + +static void intro_force_unlock(malloc_zone_t* zone) { + MI_UNUSED(zone); + // todo? +} + +static void intro_statistics(malloc_zone_t* zone, malloc_statistics_t* stats) { + MI_UNUSED(zone); + // todo... 
+ stats->blocks_in_use = 0; + stats->size_in_use = 0; + stats->max_size_in_use = 0; + stats->size_allocated = 0; +} + +static boolean_t intro_zone_locked(malloc_zone_t* zone) { + MI_UNUSED(zone); + return false; +} + + +/* ------------------------------------------------------ + At process start, override the default allocator +------------------------------------------------------ */ + +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" +#endif + +#if defined(__clang__) +#pragma clang diagnostic ignored "-Wc99-extensions" +#endif + +static malloc_introspection_t mi_introspect = { + .enumerator = &intro_enumerator, + .good_size = &intro_good_size, + .check = &intro_check, + .print = &intro_print, + .log = &intro_log, + .force_lock = &intro_force_lock, + .force_unlock = &intro_force_unlock, +#if defined(MAC_OS_X_VERSION_10_6) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6) && !defined(__ppc__) + .statistics = &intro_statistics, + .zone_locked = &intro_zone_locked, +#endif +}; + +static malloc_zone_t mi_malloc_zone = { + // note: even with designators, the order is important for C++ compilation + //.reserved1 = NULL, + //.reserved2 = NULL, + .size = &zone_size, + .malloc = &zone_malloc, + .calloc = &zone_calloc, + .valloc = &zone_valloc, + .free = &zone_free, + .realloc = &zone_realloc, + .destroy = &zone_destroy, + .zone_name = "mimalloc", + .batch_malloc = &zone_batch_malloc, + .batch_free = &zone_batch_free, + .introspect = &mi_introspect, +#if defined(MAC_OS_X_VERSION_10_6) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6) && !defined(__ppc__) + #if defined(MAC_OS_X_VERSION_10_14) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_14) + .version = 10, + #else + .version = 9, + #endif + // switch to version 9+ on OSX 10.6 to support memalign. + .memalign = &zone_memalign, + .free_definite_size = &zone_free_definite_size, + #if defined(MAC_OS_X_VERSION_10_7) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_7) + .pressure_relief = &zone_pressure_relief, + #endif + #if defined(MAC_OS_X_VERSION_10_14) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_14) + .claimed_address = &zone_claimed_address, + #endif +#else + .version = 4, +#endif +}; + +#ifdef __cplusplus +} +#endif + + +#if defined(MI_OSX_INTERPOSE) && defined(MI_SHARED_LIB_EXPORT) + +// ------------------------------------------------------ +// Override malloc_xxx and malloc_zone_xxx api's to use only +// our mimalloc zone. Since even the loader uses malloc +// on macOS, this ensures that all allocations go through +// mimalloc (as all calls are interposed). +// The main `malloc`, `free`, etc calls are interposed in `alloc-override.c`, +// Here, we also override macOS specific API's like +// `malloc_zone_calloc` etc. 
see +// ------------------------------------------------------ + +static inline malloc_zone_t* mi_get_default_zone(void) +{ + static bool init; + if mi_unlikely(!init) { + init = true; + malloc_zone_register(&mi_malloc_zone); // by calling register we avoid a zone error on free (see ) + } + return &mi_malloc_zone; +} + +mi_decl_externc int malloc_jumpstart(uintptr_t cookie); +mi_decl_externc void _malloc_fork_prepare(void); +mi_decl_externc void _malloc_fork_parent(void); +mi_decl_externc void _malloc_fork_child(void); + + +static malloc_zone_t* mi_malloc_create_zone(vm_size_t size, unsigned flags) { + MI_UNUSED(size); MI_UNUSED(flags); + return mi_get_default_zone(); +} + +static malloc_zone_t* mi_malloc_default_zone (void) { + return mi_get_default_zone(); +} + +static malloc_zone_t* mi_malloc_default_purgeable_zone(void) { + return mi_get_default_zone(); +} + +static void mi_malloc_destroy_zone(malloc_zone_t* zone) { + MI_UNUSED(zone); + // nothing. +} + +static kern_return_t mi_malloc_get_all_zones (task_t task, memory_reader_t mr, vm_address_t** addresses, unsigned* count) { + MI_UNUSED(task); MI_UNUSED(mr); + if (addresses != NULL) *addresses = NULL; + if (count != NULL) *count = 0; + return KERN_SUCCESS; +} + +static const char* mi_malloc_get_zone_name(malloc_zone_t* zone) { + return (zone == NULL ? mi_malloc_zone.zone_name : zone->zone_name); +} + +static void mi_malloc_set_zone_name(malloc_zone_t* zone, const char* name) { + MI_UNUSED(zone); MI_UNUSED(name); +} + +static int mi_malloc_jumpstart(uintptr_t cookie) { + MI_UNUSED(cookie); + return 1; // or 0 for no error? +} + +static void mi__malloc_fork_prepare(void) { + // nothing +} +static void mi__malloc_fork_parent(void) { + // nothing +} +static void mi__malloc_fork_child(void) { + // nothing +} + +static void mi_malloc_printf(const char* fmt, ...) 
{ + MI_UNUSED(fmt); +} + +static bool zone_check(malloc_zone_t* zone) { + MI_UNUSED(zone); + return true; +} + +static malloc_zone_t* zone_from_ptr(const void* p) { + MI_UNUSED(p); + return mi_get_default_zone(); +} + +static void zone_log(malloc_zone_t* zone, void* p) { + MI_UNUSED(zone); MI_UNUSED(p); +} + +static void zone_print(malloc_zone_t* zone, bool b) { + MI_UNUSED(zone); MI_UNUSED(b); +} + +static void zone_print_ptr_info(void* p) { + MI_UNUSED(p); +} + +static void zone_register(malloc_zone_t* zone) { + MI_UNUSED(zone); +} + +static void zone_unregister(malloc_zone_t* zone) { + MI_UNUSED(zone); +} + +// use interposing so `DYLD_INSERT_LIBRARIES` works without `DYLD_FORCE_FLAT_NAMESPACE=1` +// See: +struct mi_interpose_s { + const void* replacement; + const void* target; +}; +#define MI_INTERPOSE_FUN(oldfun,newfun) { (const void*)&newfun, (const void*)&oldfun } +#define MI_INTERPOSE_MI(fun) MI_INTERPOSE_FUN(fun,mi_##fun) +#define MI_INTERPOSE_ZONE(fun) MI_INTERPOSE_FUN(malloc_##fun,fun) +__attribute__((used)) static const struct mi_interpose_s _mi_zone_interposes[] __attribute__((section("__DATA, __interpose"))) = +{ + + MI_INTERPOSE_MI(malloc_create_zone), + MI_INTERPOSE_MI(malloc_default_purgeable_zone), + MI_INTERPOSE_MI(malloc_default_zone), + MI_INTERPOSE_MI(malloc_destroy_zone), + MI_INTERPOSE_MI(malloc_get_all_zones), + MI_INTERPOSE_MI(malloc_get_zone_name), + MI_INTERPOSE_MI(malloc_jumpstart), + MI_INTERPOSE_MI(malloc_printf), + MI_INTERPOSE_MI(malloc_set_zone_name), + MI_INTERPOSE_MI(_malloc_fork_child), + MI_INTERPOSE_MI(_malloc_fork_parent), + MI_INTERPOSE_MI(_malloc_fork_prepare), + + MI_INTERPOSE_ZONE(zone_batch_free), + MI_INTERPOSE_ZONE(zone_batch_malloc), + MI_INTERPOSE_ZONE(zone_calloc), + MI_INTERPOSE_ZONE(zone_check), + MI_INTERPOSE_ZONE(zone_free), + MI_INTERPOSE_ZONE(zone_from_ptr), + MI_INTERPOSE_ZONE(zone_log), + MI_INTERPOSE_ZONE(zone_malloc), + MI_INTERPOSE_ZONE(zone_memalign), + MI_INTERPOSE_ZONE(zone_print), + MI_INTERPOSE_ZONE(zone_print_ptr_info), + MI_INTERPOSE_ZONE(zone_realloc), + MI_INTERPOSE_ZONE(zone_register), + MI_INTERPOSE_ZONE(zone_unregister), + MI_INTERPOSE_ZONE(zone_valloc) +}; + + +#else + +// ------------------------------------------------------ +// hook into the zone api's without interposing +// This is the official way of adding an allocator but +// it seems less robust than using interpose. +// ------------------------------------------------------ + +static inline malloc_zone_t* mi_get_default_zone(void) +{ + // The first returned zone is the real default + malloc_zone_t** zones = NULL; + unsigned count = 0; + kern_return_t ret = malloc_get_all_zones(0, NULL, (vm_address_t**)&zones, &count); + if (ret == KERN_SUCCESS && count > 0) { + return zones[0]; + } + else { + // fallback + return malloc_default_zone(); + } +} + +#if defined(__clang__) +__attribute__((constructor(0))) +#else +__attribute__((constructor)) // seems not supported by g++-11 on the M1 +#endif +__attribute__((used)) +static void _mi_macos_override_malloc(void) { + malloc_zone_t* purgeable_zone = NULL; + + #if defined(MAC_OS_X_VERSION_10_6) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6) + // force the purgeable zone to exist to avoid strange bugs + if (malloc_default_purgeable_zone) { + purgeable_zone = malloc_default_purgeable_zone(); + } + #endif + + // Register our zone. + // thomcc: I think this is still needed to put us in the zone list. 
+ malloc_zone_register(&mi_malloc_zone); + // Unregister the default zone, this makes our zone the new default + // as that was the last registered. + malloc_zone_t *default_zone = mi_get_default_zone(); + // thomcc: Unsure if the next test is *always* false or just false in the + // cases I've tried. I'm also unsure if the code inside is needed. at all + if (default_zone != &mi_malloc_zone) { + malloc_zone_unregister(default_zone); + + // Reregister the default zone so free and realloc in that zone keep working. + malloc_zone_register(default_zone); + } + + // Unregister, and re-register the purgeable_zone to avoid bugs if it occurs + // earlier than the default zone. + if (purgeable_zone != NULL) { + malloc_zone_unregister(purgeable_zone); + malloc_zone_register(purgeable_zone); + } + +} +#endif // MI_OSX_INTERPOSE + +#endif // MI_MALLOC_OVERRIDE diff --git a/contrib/libs/mimalloc/src/prim/osx/prim.c b/contrib/libs/mimalloc/src/prim/osx/prim.c new file mode 100644 index 000000000000..8a2f4e8aa473 --- /dev/null +++ b/contrib/libs/mimalloc/src/prim/osx/prim.c @@ -0,0 +1,9 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018-2023, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +// We use the unix/prim.c with the mmap API on macOSX +#include "../unix/prim.c" diff --git a/contrib/libs/mimalloc/src/prim/prim.c b/contrib/libs/mimalloc/src/prim/prim.c new file mode 100644 index 000000000000..992a1a4be570 --- /dev/null +++ b/contrib/libs/mimalloc/src/prim/prim.c @@ -0,0 +1,27 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018-2023, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +// Select the implementation of the primitives +// depending on the OS. + +#if defined(_WIN32) +#error #include "windows/prim.c" // VirtualAlloc (Windows) + +#elif defined(__APPLE__) +#include "osx/prim.c" // macOSX (actually defers to mmap in unix/prim.c) + +#elif defined(__wasi__) +#define MI_USE_SBRK +#error #include "wasi/prim.c" // memory-grow or sbrk (Wasm) + +#elif defined(__EMSCRIPTEN__) +#error #include "emscripten/prim.c" // emmalloc_*, + pthread support + +#else +#include "unix/prim.c" // mmap() (Linux, macOSX, BSD, Illumnos, Haiku, DragonFly, etc.) + +#endif diff --git a/contrib/libs/mimalloc/src/prim/unix/prim.c b/contrib/libs/mimalloc/src/prim/unix/prim.c new file mode 100644 index 000000000000..7890f936b906 --- /dev/null +++ b/contrib/libs/mimalloc/src/prim/unix/prim.c @@ -0,0 +1,882 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018-2023, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. 
+-----------------------------------------------------------------------------*/ + +// This file is included in `src/prim/prim.c` + +#ifndef _DEFAULT_SOURCE +#define _DEFAULT_SOURCE // ensure mmap flags and syscall are defined +#endif + +#if defined(__sun) +// illumos provides new mman.h api when any of these are defined +// otherwise the old api based on caddr_t which predates the void pointers one. +// stock solaris provides only the former, chose to atomically to discard those +// flags only here rather than project wide tough. +#undef _XOPEN_SOURCE +#undef _POSIX_C_SOURCE +#endif + +#include "mimalloc.h" +#include "mimalloc/internal.h" +#include "mimalloc/atomic.h" +#include "mimalloc/prim.h" + +#include // mmap +#include // sysconf +#include // open, close, read, access + +#if defined(__linux__) + #include + #if defined(MI_NO_THP) + #include + #endif + #if defined(__GLIBC__) + #include // linux mmap flags + #else + #include + #endif +#elif defined(__APPLE__) + #include + #include + #if !defined(TARGET_OS_OSX) || TARGET_OS_OSX // see issue #879, used to be (!TARGET_IOS_IPHONE && !TARGET_IOS_SIMULATOR) + #include // VM_MAKE_TAG, VM_FLAGS_SUPERPAGE_SIZE_2MB, etc. + #endif + #if !defined(MAC_OS_X_VERSION_10_7) + #define MAC_OS_X_VERSION_10_7 1070 + #endif +#elif defined(__FreeBSD__) || defined(__DragonFly__) + #include + #if __FreeBSD_version >= 1200000 + #include + #error #include + #endif + #include +#endif + +#if defined(__linux__) || defined(__FreeBSD__) + #define MI_HAS_SYSCALL_H + #include +#endif + + +//------------------------------------------------------------------------------------ +// Use syscalls for some primitives to allow for libraries that override open/read/close etc. +// and do allocation themselves; using syscalls prevents recursion when mimalloc is +// still initializing (issue #713) +// Declare inline to avoid unused function warnings. 
+//------------------------------------------------------------------------------------ + +#if defined(MI_HAS_SYSCALL_H) && defined(SYS_open) && defined(SYS_close) && defined(SYS_read) && defined(SYS_access) + +static inline int mi_prim_open(const char* fpath, int open_flags) { + return syscall(SYS_open,fpath,open_flags,0); +} +static inline ssize_t mi_prim_read(int fd, void* buf, size_t bufsize) { + return syscall(SYS_read,fd,buf,bufsize); +} +static inline int mi_prim_close(int fd) { + return syscall(SYS_close,fd); +} +static inline int mi_prim_access(const char *fpath, int mode) { + return syscall(SYS_access,fpath,mode); +} + +#else + +static inline int mi_prim_open(const char* fpath, int open_flags) { + return open(fpath,open_flags); +} +static inline ssize_t mi_prim_read(int fd, void* buf, size_t bufsize) { + return read(fd,buf,bufsize); +} +static inline int mi_prim_close(int fd) { + return close(fd); +} +static inline int mi_prim_access(const char *fpath, int mode) { + return access(fpath,mode); +} + +#endif + + + +//--------------------------------------------- +// init +//--------------------------------------------- + +static bool unix_detect_overcommit(void) { + bool os_overcommit = true; +#if defined(__linux__) + int fd = mi_prim_open("/proc/sys/vm/overcommit_memory", O_RDONLY); + if (fd >= 0) { + char buf[32]; + ssize_t nread = mi_prim_read(fd, &buf, sizeof(buf)); + mi_prim_close(fd); + // + // 0: heuristic overcommit, 1: always overcommit, 2: never overcommit (ignore NORESERVE) + if (nread >= 1) { + os_overcommit = (buf[0] == '0' || buf[0] == '1'); + } + } +#elif defined(__FreeBSD__) + int val = 0; + size_t olen = sizeof(val); + if (sysctlbyname("vm.overcommit", &val, &olen, NULL, 0) == 0) { + os_overcommit = (val != 0); + } +#else + // default: overcommit is true +#endif + return os_overcommit; +} + +void _mi_prim_mem_init( mi_os_mem_config_t* config ) +{ + long psize = sysconf(_SC_PAGESIZE); + if (psize > 0) { + config->page_size = (size_t)psize; + config->alloc_granularity = (size_t)psize; + } + config->large_page_size = 2*MI_MiB; // TODO: can we query the OS for this? + config->has_overcommit = unix_detect_overcommit(); + config->has_partial_free = true; // mmap can free in parts + config->has_virtual_reserve = true; // todo: check if this true for NetBSD? (for anonymous mmap with PROT_NONE) + + // disable transparent huge pages for this process? + #if (defined(__linux__) || defined(__ANDROID__)) && defined(PR_GET_THP_DISABLE) + #if defined(MI_NO_THP) + if (true) + #else + if (!mi_option_is_enabled(mi_option_allow_large_os_pages)) // disable THP also if large OS pages are not allowed in the options + #endif + { + int val = 0; + if (prctl(PR_GET_THP_DISABLE, &val, 0, 0, 0) != 0) { + // Most likely since distros often come with always/madvise settings. + val = 1; + // Disabling only for mimalloc process rather than touching system wide settings + (void)prctl(PR_SET_THP_DISABLE, &val, 0, 0, 0); + } + } + #endif +} + + +//--------------------------------------------- +// free +//--------------------------------------------- + +int _mi_prim_free(void* addr, size_t size ) { + bool err = (munmap(addr, size) == -1); + return (err ? 
errno : 0); +} + + +//--------------------------------------------- +// mmap +//--------------------------------------------- + +static int unix_madvise(void* addr, size_t size, int advice) { + #if defined(__sun) + return madvise((caddr_t)addr, size, advice); // Solaris needs cast (issue #520) + #else + return madvise(addr, size, advice); + #endif +} + +static void* unix_mmap_prim(void* addr, size_t size, size_t try_alignment, int protect_flags, int flags, int fd) { + MI_UNUSED(try_alignment); + void* p = NULL; + #if defined(MAP_ALIGNED) // BSD + if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) { + size_t n = mi_bsr(try_alignment); + if (((size_t)1 << n) == try_alignment && n >= 12 && n <= 30) { // alignment is a power of 2 and 4096 <= alignment <= 1GiB + p = mmap(addr, size, protect_flags, flags | MAP_ALIGNED(n), fd, 0); + if (p==MAP_FAILED || !_mi_is_aligned(p,try_alignment)) { + int err = errno; + _mi_trace_message("unable to directly request aligned OS memory (error: %d (0x%x), size: 0x%zx bytes, alignment: 0x%zx, hint address: %p)\n", err, err, size, try_alignment, addr); + } + if (p!=MAP_FAILED) return p; + // fall back to regular mmap + } + } + #elif defined(MAP_ALIGN) // Solaris + if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) { + p = mmap((void*)try_alignment, size, protect_flags, flags | MAP_ALIGN, fd, 0); // addr parameter is the required alignment + if (p!=MAP_FAILED) return p; + // fall back to regular mmap + } + #endif + #if (MI_INTPTR_SIZE >= 8) && !defined(MAP_ALIGNED) + // on 64-bit systems, use the virtual address area after 2TiB for 4MiB aligned allocations + if (addr == NULL) { + void* hint = _mi_os_get_aligned_hint(try_alignment, size); + if (hint != NULL) { + p = mmap(hint, size, protect_flags, flags, fd, 0); + if (p==MAP_FAILED || !_mi_is_aligned(p,try_alignment)) { + #if MI_TRACK_ENABLED // asan sometimes does not instrument errno correctly? + int err = 0; + #else + int err = errno; + #endif + _mi_trace_message("unable to directly request hinted aligned OS memory (error: %d (0x%x), size: 0x%zx bytes, alignment: 0x%zx, hint address: %p)\n", err, err, size, try_alignment, hint); + } + if (p!=MAP_FAILED) return p; + // fall back to regular mmap + } + } + #endif + // regular mmap + p = mmap(addr, size, protect_flags, flags, fd, 0); + if (p!=MAP_FAILED) return p; + // failed to allocate + return NULL; +} + +static int unix_mmap_fd(void) { + #if defined(VM_MAKE_TAG) + // macOS: tracking anonymous page with a specific ID. 
(All up to 98 are taken officially but LLVM sanitizers had taken 99) + int os_tag = (int)mi_option_get(mi_option_os_tag); + if (os_tag < 100 || os_tag > 255) { os_tag = 100; } + return VM_MAKE_TAG(os_tag); + #else + return -1; + #endif +} + +static void* unix_mmap(void* addr, size_t size, size_t try_alignment, int protect_flags, bool large_only, bool allow_large, bool* is_large) { + #if !defined(MAP_ANONYMOUS) + #define MAP_ANONYMOUS MAP_ANON + #endif + #if !defined(MAP_NORESERVE) + #define MAP_NORESERVE 0 + #endif + void* p = NULL; + const int fd = unix_mmap_fd(); + int flags = MAP_PRIVATE | MAP_ANONYMOUS; + if (_mi_os_has_overcommit()) { + flags |= MAP_NORESERVE; + } + #if defined(PROT_MAX) + protect_flags |= PROT_MAX(PROT_READ | PROT_WRITE); // BSD + #endif + // huge page allocation + if ((large_only || _mi_os_use_large_page(size, try_alignment)) && allow_large) { + static _Atomic(size_t) large_page_try_ok; // = 0; + size_t try_ok = mi_atomic_load_acquire(&large_page_try_ok); + if (!large_only && try_ok > 0) { + // If the OS is not configured for large OS pages, or the user does not have + // enough permission, the `mmap` will always fail (but it might also fail for other reasons). + // Therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times + // to avoid too many failing calls to mmap. + mi_atomic_cas_strong_acq_rel(&large_page_try_ok, &try_ok, try_ok - 1); + } + else { + int lflags = flags & ~MAP_NORESERVE; // using NORESERVE on huge pages seems to fail on Linux + int lfd = fd; + #ifdef MAP_ALIGNED_SUPER + lflags |= MAP_ALIGNED_SUPER; + #endif + #ifdef MAP_HUGETLB + lflags |= MAP_HUGETLB; + #endif + #ifdef MAP_HUGE_1GB + static bool mi_huge_pages_available = true; + if ((size % MI_GiB) == 0 && mi_huge_pages_available) { + lflags |= MAP_HUGE_1GB; + } + else + #endif + { + #ifdef MAP_HUGE_2MB + lflags |= MAP_HUGE_2MB; + #endif + } + #ifdef VM_FLAGS_SUPERPAGE_SIZE_2MB + lfd |= VM_FLAGS_SUPERPAGE_SIZE_2MB; + #endif + if (large_only || lflags != flags) { + // try large OS page allocation + *is_large = true; + p = unix_mmap_prim(addr, size, try_alignment, protect_flags, lflags, lfd); + #ifdef MAP_HUGE_1GB + if (p == NULL && (lflags & MAP_HUGE_1GB) == MAP_HUGE_1GB) { + mi_huge_pages_available = false; // don't try huge 1GiB pages again + _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (errno: %i)\n", errno); + lflags = ((lflags & ~MAP_HUGE_1GB) | MAP_HUGE_2MB); + p = unix_mmap_prim(addr, size, try_alignment, protect_flags, lflags, lfd); + } + #endif + if (large_only) return p; + if (p == NULL) { + mi_atomic_store_release(&large_page_try_ok, (size_t)8); // on error, don't try again for the next N allocations + } + } + } + } + // regular allocation + if (p == NULL) { + *is_large = false; + p = unix_mmap_prim(addr, size, try_alignment, protect_flags, flags, fd); + if (p != NULL) { + #if defined(MADV_HUGEPAGE) + // Many Linux systems don't allow MAP_HUGETLB but they support instead + // transparent huge pages (THP). Generally, it is not required to call `madvise` with MADV_HUGE + // though since properly aligned allocations will already use large pages if available + // in that case -- in particular for our large regions (in `memory.c`). + // However, some systems only allow THP if called with explicit `madvise`, so + // when large OS pages are enabled for mimalloc, we call `madvise` anyways. 
+ if (allow_large && _mi_os_use_large_page(size, try_alignment)) { + if (unix_madvise(p, size, MADV_HUGEPAGE) == 0) { + *is_large = true; // possibly + }; + } + #elif defined(__sun) + if (allow_large && _mi_os_use_large_page(size, try_alignment)) { + struct memcntl_mha cmd = {0}; + cmd.mha_pagesize = _mi_os_large_page_size(); + cmd.mha_cmd = MHA_MAPSIZE_VA; + if (memcntl((caddr_t)p, size, MC_HAT_ADVISE, (caddr_t)&cmd, 0, 0) == 0) { + *is_large = true; + } + } + #endif + } + } + return p; +} + +// Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned. +int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) { + mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0); + mi_assert_internal(commit || !allow_large); + mi_assert_internal(try_alignment > 0); + + *is_zero = true; + int protect_flags = (commit ? (PROT_WRITE | PROT_READ) : PROT_NONE); + *addr = unix_mmap(NULL, size, try_alignment, protect_flags, false, allow_large, is_large); + return (*addr != NULL ? 0 : errno); +} + + +//--------------------------------------------- +// Commit/Reset +//--------------------------------------------- + +static void unix_mprotect_hint(int err) { + #if defined(__linux__) && (MI_SECURE>=2) // guard page around every mimalloc page + if (err == ENOMEM) { + _mi_warning_message("The next warning may be caused by a low memory map limit.\n" + " On Linux this is controlled by the vm.max_map_count -- maybe increase it?\n" + " For example: sudo sysctl -w vm.max_map_count=262144\n"); + } + #else + MI_UNUSED(err); + #endif +} + + + + + +int _mi_prim_commit(void* start, size_t size, bool* is_zero) { + // commit: ensure we can access the area + // note: we may think that *is_zero can be true since the memory + // was either from mmap PROT_NONE, or from decommit MADV_DONTNEED, but + // we sometimes call commit on a range with still partially committed + // memory and `mprotect` does not zero the range. + *is_zero = false; + int err = mprotect(start, size, (PROT_READ | PROT_WRITE)); + if (err != 0) { + err = errno; + unix_mprotect_hint(err); + } + return err; +} + +int _mi_prim_decommit(void* start, size_t size, bool* needs_recommit) { + int err = 0; + // decommit: use MADV_DONTNEED as it decreases rss immediately (unlike MADV_FREE) + err = unix_madvise(start, size, MADV_DONTNEED); + #if !MI_DEBUG && !MI_SECURE + *needs_recommit = false; + #else + *needs_recommit = true; + mprotect(start, size, PROT_NONE); + #endif + /* + // decommit: use mmap with MAP_FIXED and PROT_NONE to discard the existing memory (and reduce rss) + *needs_recommit = true; + const int fd = unix_mmap_fd(); + void* p = mmap(start, size, PROT_NONE, (MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE), fd, 0); + if (p != start) { err = errno; } + */ + return err; +} + +int _mi_prim_reset(void* start, size_t size) { + // We try to use `MADV_FREE` as that is the fastest. A drawback though is that it + // will not reduce the `rss` stats in tools like `top` even though the memory is available + // to other processes. With the default `MIMALLOC_PURGE_DECOMMITS=1` we ensure that by + // default `MADV_DONTNEED` is used though. 
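/* An illustrative sketch (not from the mimalloc sources): _mi_prim_commit and
   _mi_prim_decommit above implement the classic reserve/commit/decommit pattern on top
   of mmap, mprotect and madvise. A standalone Linux-flavoured sketch, using the
   hypothetical names demo_reserve/demo_commit/demo_decommit, could look like this. */
#include <sys/mman.h>
#include <stddef.h>

#ifndef MAP_ANONYMOUS
#define MAP_ANONYMOUS MAP_ANON
#endif

// reserve address space only: inaccessible and (with MAP_NORESERVE) not charged
static void* demo_reserve(size_t size) {
  void* p = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
  return (p == MAP_FAILED ? NULL : p);
}

// commit: make the range readable/writable (the OS backs it on first touch)
static int demo_commit(void* p, size_t size) {
  return mprotect(p, size, PROT_READ | PROT_WRITE);
}

// decommit: return the physical pages while keeping the address range reserved;
// MADV_DONTNEED lowers rss immediately, matching the choice made in _mi_prim_decommit
static int demo_decommit(void* p, size_t size) {
  return madvise(p, size, MADV_DONTNEED);
}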
+ #if defined(MADV_FREE) + static _Atomic(size_t) advice = MI_ATOMIC_VAR_INIT(MADV_FREE); + int oadvice = (int)mi_atomic_load_relaxed(&advice); + int err; + while ((err = unix_madvise(start, size, oadvice)) != 0 && errno == EAGAIN) { errno = 0; }; + if (err != 0 && errno == EINVAL && oadvice == MADV_FREE) { + // if MADV_FREE is not supported, fall back to MADV_DONTNEED from now on + mi_atomic_store_release(&advice, (size_t)MADV_DONTNEED); + err = unix_madvise(start, size, MADV_DONTNEED); + } + #else + int err = unix_madvise(start, size, MADV_DONTNEED); + #endif + return err; +} + +int _mi_prim_protect(void* start, size_t size, bool protect) { + int err = mprotect(start, size, protect ? PROT_NONE : (PROT_READ | PROT_WRITE)); + if (err != 0) { err = errno; } + unix_mprotect_hint(err); + return err; +} + + + +//--------------------------------------------- +// Huge page allocation +//--------------------------------------------- + +#if (MI_INTPTR_SIZE >= 8) && !defined(__HAIKU__) && !defined(__CYGWIN__) + +#ifndef MPOL_PREFERRED +#define MPOL_PREFERRED 1 +#endif + +#if defined(MI_HAS_SYSCALL_H) && defined(SYS_mbind) +static long mi_prim_mbind(void* start, unsigned long len, unsigned long mode, const unsigned long* nmask, unsigned long maxnode, unsigned flags) { + return syscall(SYS_mbind, start, len, mode, nmask, maxnode, flags); +} +#else +static long mi_prim_mbind(void* start, unsigned long len, unsigned long mode, const unsigned long* nmask, unsigned long maxnode, unsigned flags) { + MI_UNUSED(start); MI_UNUSED(len); MI_UNUSED(mode); MI_UNUSED(nmask); MI_UNUSED(maxnode); MI_UNUSED(flags); + return 0; +} +#endif + +int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr) { + bool is_large = true; + *is_zero = true; + *addr = unix_mmap(hint_addr, size, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &is_large); + if (*addr != NULL && numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) { // at most 64 nodes + unsigned long numa_mask = (1UL << numa_node); + // TODO: does `mbind` work correctly for huge OS pages? should we + // use `set_mempolicy` before calling mmap instead? + // see: + long err = mi_prim_mbind(*addr, size, MPOL_PREFERRED, &numa_mask, 8*MI_INTPTR_SIZE, 0); + if (err != 0) { + err = errno; + _mi_warning_message("failed to bind huge (1GiB) pages to numa node %d (error: %d (0x%x))\n", numa_node, err, err); + } + } + return (*addr != NULL ? 0 : errno); +} + +#else + +int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr) { + MI_UNUSED(hint_addr); MI_UNUSED(size); MI_UNUSED(numa_node); + *is_zero = false; + *addr = NULL; + return ENOMEM; +} + +#endif + +//--------------------------------------------- +// NUMA nodes +//--------------------------------------------- + +#if defined(__linux__) + +size_t _mi_prim_numa_node(void) { + #if defined(MI_HAS_SYSCALL_H) && defined(SYS_getcpu) + unsigned long node = 0; + unsigned long ncpu = 0; + long err = syscall(SYS_getcpu, &ncpu, &node, NULL); + if (err != 0) return 0; + return node; + #else + return 0; + #endif +} + +size_t _mi_prim_numa_node_count(void) { + char buf[128]; + unsigned node = 0; + for(node = 0; node < 256; node++) { + // enumerate node entries -- todo: it there a more efficient way to do this? 
(but ensure there is no allocation) + _mi_snprintf(buf, 127, "/sys/devices/system/node/node%u", node + 1); + if (mi_prim_access(buf,R_OK) != 0) break; + } + return (node+1); +} + +#elif defined(__FreeBSD__) && __FreeBSD_version >= 1200000 + +size_t _mi_prim_numa_node(void) { + domainset_t dom; + size_t node; + int policy; + if (cpuset_getdomain(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, sizeof(dom), &dom, &policy) == -1) return 0ul; + for (node = 0; node < MAXMEMDOM; node++) { + if (DOMAINSET_ISSET(node, &dom)) return node; + } + return 0ul; +} + +size_t _mi_prim_numa_node_count(void) { + size_t ndomains = 0; + size_t len = sizeof(ndomains); + if (sysctlbyname("vm.ndomains", &ndomains, &len, NULL, 0) == -1) return 0ul; + return ndomains; +} + +#elif defined(__DragonFly__) + +size_t _mi_prim_numa_node(void) { + // TODO: DragonFly does not seem to provide any userland means to get this information. + return 0ul; +} + +size_t _mi_prim_numa_node_count(void) { + size_t ncpus = 0, nvirtcoresperphys = 0; + size_t len = sizeof(size_t); + if (sysctlbyname("hw.ncpu", &ncpus, &len, NULL, 0) == -1) return 0ul; + if (sysctlbyname("hw.cpu_topology_ht_ids", &nvirtcoresperphys, &len, NULL, 0) == -1) return 0ul; + return nvirtcoresperphys * ncpus; +} + +#else + +size_t _mi_prim_numa_node(void) { + return 0; +} + +size_t _mi_prim_numa_node_count(void) { + return 1; +} + +#endif + +// ---------------------------------------------------------------- +// Clock +// ---------------------------------------------------------------- + +#include + +#if defined(CLOCK_REALTIME) || defined(CLOCK_MONOTONIC) + +mi_msecs_t _mi_prim_clock_now(void) { + struct timespec t; + #ifdef CLOCK_MONOTONIC + clock_gettime(CLOCK_MONOTONIC, &t); + #else + clock_gettime(CLOCK_REALTIME, &t); + #endif + return ((mi_msecs_t)t.tv_sec * 1000) + ((mi_msecs_t)t.tv_nsec / 1000000); +} + +#else + +// low resolution timer +mi_msecs_t _mi_prim_clock_now(void) { + #if !defined(CLOCKS_PER_SEC) || (CLOCKS_PER_SEC == 1000) || (CLOCKS_PER_SEC == 0) + return (mi_msecs_t)clock(); + #elif (CLOCKS_PER_SEC < 1000) + return (mi_msecs_t)clock() * (1000 / (mi_msecs_t)CLOCKS_PER_SEC); + #else + return (mi_msecs_t)clock() / ((mi_msecs_t)CLOCKS_PER_SEC / 1000); + #endif +} + +#endif + + + + +//---------------------------------------------------------------- +// Process info +//---------------------------------------------------------------- + +#if defined(__unix__) || defined(__unix) || defined(unix) || defined(__APPLE__) || defined(__HAIKU__) +#include +#include +#include + +#if defined(__APPLE__) +#include +#endif + +#if defined(__HAIKU__) +#error #include +#endif + +static mi_msecs_t timeval_secs(const struct timeval* tv) { + return ((mi_msecs_t)tv->tv_sec * 1000L) + ((mi_msecs_t)tv->tv_usec / 1000L); +} + +void _mi_prim_process_info(mi_process_info_t* pinfo) +{ + struct rusage rusage; + getrusage(RUSAGE_SELF, &rusage); + pinfo->utime = timeval_secs(&rusage.ru_utime); + pinfo->stime = timeval_secs(&rusage.ru_stime); +#if !defined(__HAIKU__) + pinfo->page_faults = rusage.ru_majflt; +#endif +#if defined(__HAIKU__) + // Haiku does not have (yet?) 
a way to + // get these stats per process + thread_info tid; + area_info mem; + ssize_t c; + get_thread_info(find_thread(0), &tid); + while (get_next_area_info(tid.team, &c, &mem) == B_OK) { + pinfo->peak_rss += mem.ram_size; + } + pinfo->page_faults = 0; +#elif defined(__APPLE__) + pinfo->peak_rss = rusage.ru_maxrss; // macos reports in bytes + #ifdef MACH_TASK_BASIC_INFO + struct mach_task_basic_info info; + mach_msg_type_number_t infoCount = MACH_TASK_BASIC_INFO_COUNT; + if (task_info(mach_task_self(), MACH_TASK_BASIC_INFO, (task_info_t)&info, &infoCount) == KERN_SUCCESS) { + pinfo->current_rss = (size_t)info.resident_size; + } + #else + struct task_basic_info info; + mach_msg_type_number_t infoCount = TASK_BASIC_INFO_COUNT; + if (task_info(mach_task_self(), TASK_BASIC_INFO, (task_info_t)&info, &infoCount) == KERN_SUCCESS) { + pinfo->current_rss = (size_t)info.resident_size; + } + #endif +#else + pinfo->peak_rss = rusage.ru_maxrss * 1024; // Linux/BSD report in KiB +#endif + // use defaults for commit +} + +#else + +#ifndef __wasi__ +// WebAssembly instances are not processes +#pragma message("define a way to get process info") +#endif + +void _mi_prim_process_info(mi_process_info_t* pinfo) +{ + // use defaults + MI_UNUSED(pinfo); +} + +#endif + + +//---------------------------------------------------------------- +// Output +//---------------------------------------------------------------- + +void _mi_prim_out_stderr( const char* msg ) { + fputs(msg,stderr); +} + + +//---------------------------------------------------------------- +// Environment +//---------------------------------------------------------------- + +#if !defined(MI_USE_ENVIRON) || (MI_USE_ENVIRON!=0) +// On Posix systemsr use `environ` to access environment variables +// even before the C runtime is initialized. +#if defined(__APPLE__) && defined(__has_include) && __has_include() +#include +static char** mi_get_environ(void) { + return (*_NSGetEnviron()); +} +#else +extern char** environ; +static char** mi_get_environ(void) { + return environ; +} +#endif +bool _mi_prim_getenv(const char* name, char* result, size_t result_size) { + if (name==NULL) return false; + const size_t len = _mi_strlen(name); + if (len == 0) return false; + char** env = mi_get_environ(); + if (env == NULL) return false; + // compare up to 10000 entries + for (int i = 0; i < 10000 && env[i] != NULL; i++) { + const char* s = env[i]; + if (_mi_strnicmp(name, s, len) == 0 && s[len] == '=') { // case insensitive + // found it + _mi_strlcpy(result, s + len + 1, result_size); + return true; + } + } + return false; +} +#else +// fallback: use standard C `getenv` but this cannot be used while initializing the C runtime +bool _mi_prim_getenv(const char* name, char* result, size_t result_size) { + // cannot call getenv() when still initializing the C runtime. + if (_mi_preloading()) return false; + const char* s = getenv(name); + if (s == NULL) { + // we check the upper case name too. 
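/* An illustrative sketch (not from the mimalloc sources): _mi_prim_process_info above
   reads CPU times and the peak RSS from getrusage() and has to normalize ru_maxrss,
   because macOS reports bytes while Linux/BSD report KiB. A hypothetical helper
   demo_peak_rss showing just that normalization: */
#include <sys/resource.h>
#include <stddef.h>

static size_t demo_peak_rss(void) {
  struct rusage ru;
  if (getrusage(RUSAGE_SELF, &ru) != 0) return 0;
#if defined(__APPLE__)
  return (size_t)ru.ru_maxrss;           // macOS: ru_maxrss is in bytes
#else
  return (size_t)ru.ru_maxrss * 1024;    // Linux/BSD: ru_maxrss is in KiB
#endif
}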
+ char buf[64+1]; + size_t len = _mi_strnlen(name,sizeof(buf)-1); + for (size_t i = 0; i < len; i++) { + buf[i] = _mi_toupper(name[i]); + } + buf[len] = 0; + s = getenv(buf); + } + if (s == NULL || _mi_strnlen(s,result_size) >= result_size) return false; + _mi_strlcpy(result, s, result_size); + return true; +} +#endif // !MI_USE_ENVIRON + + +//---------------------------------------------------------------- +// Random +//---------------------------------------------------------------- + +#if defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_15) && (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_15) +#include +#include + +bool _mi_prim_random_buf(void* buf, size_t buf_len) { + // We prefere CCRandomGenerateBytes as it returns an error code while arc4random_buf + // may fail silently on macOS. See PR #390, and + return (CCRandomGenerateBytes(buf, buf_len) == kCCSuccess); +} + +#elif defined(__ANDROID__) || defined(__DragonFly__) || \ + defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ + defined(__sun) || \ + (defined(__APPLE__) && (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7)) + +#include +bool _mi_prim_random_buf(void* buf, size_t buf_len) { + arc4random_buf(buf, buf_len); + return true; +} + +#elif defined(__APPLE__) || defined(__linux__) || defined(__HAIKU__) // also for old apple versions < 10.7 (issue #829) + +#include +#include +#include + +bool _mi_prim_random_buf(void* buf, size_t buf_len) { + // Modern Linux provides `getrandom` but different distributions either use `sys/random.h` or `linux/random.h` + // and for the latter the actual `getrandom` call is not always defined. + // (see ) + // We therefore use a syscall directly and fall back dynamically to /dev/urandom when needed. + #if defined(MI_HAS_SYSCALL_H) && defined(SYS_getrandom) + #ifndef GRND_NONBLOCK + #define GRND_NONBLOCK (1) + #endif + static _Atomic(uintptr_t) no_getrandom; // = 0 + if (mi_atomic_load_acquire(&no_getrandom)==0) { + ssize_t ret = syscall(SYS_getrandom, buf, buf_len, GRND_NONBLOCK); + if (ret >= 0) return (buf_len == (size_t)ret); + if (errno != ENOSYS) return false; + mi_atomic_store_release(&no_getrandom, (uintptr_t)1); // don't call again, and fall back to /dev/urandom + } + #endif + int flags = O_RDONLY; + #if defined(O_CLOEXEC) + flags |= O_CLOEXEC; + #endif + int fd = mi_prim_open("/dev/urandom", flags); + if (fd < 0) return false; + size_t count = 0; + while(count < buf_len) { + ssize_t ret = mi_prim_read(fd, (char*)buf + count, buf_len - count); + if (ret<=0) { + if (errno!=EAGAIN && errno!=EINTR) break; + } + else { + count += ret; + } + } + mi_prim_close(fd); + return (count==buf_len); +} + +#else + +bool _mi_prim_random_buf(void* buf, size_t buf_len) { + return false; +} + +#endif + + +//---------------------------------------------------------------- +// Thread init/done +//---------------------------------------------------------------- + +#if defined(MI_USE_PTHREADS) + +// use pthread local storage keys to detect thread ending +// (and used with MI_TLS_PTHREADS for the default heap) +pthread_key_t _mi_heap_default_key = (pthread_key_t)(-1); + +static void mi_pthread_done(void* value) { + if (value!=NULL) { + _mi_thread_done((mi_heap_t*)value); + } +} + +void _mi_prim_thread_init_auto_done(void) { + mi_assert_internal(_mi_heap_default_key == (pthread_key_t)(-1)); + pthread_key_create(&_mi_heap_default_key, &mi_pthread_done); +} + +void _mi_prim_thread_done_auto_done(void) { + if (_mi_heap_default_key != (pthread_key_t)(-1)) { // do not leak the key, 
see issue #809 + pthread_key_delete(_mi_heap_default_key); + } +} + +void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { + if (_mi_heap_default_key != (pthread_key_t)(-1)) { // can happen during recursive invocation on freeBSD + pthread_setspecific(_mi_heap_default_key, heap); + } +} + +#else + +void _mi_prim_thread_init_auto_done(void) { + // nothing +} + +void _mi_prim_thread_done_auto_done(void) { + // nothing +} + +void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { + MI_UNUSED(heap); +} + +#endif diff --git a/contrib/libs/mimalloc/src/random.c b/contrib/libs/mimalloc/src/random.c index 255bede4db34..4fc8b2f8fb0b 100644 --- a/contrib/libs/mimalloc/src/random.c +++ b/contrib/libs/mimalloc/src/random.c @@ -5,9 +5,9 @@ terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #include "mimalloc.h" -#include "mimalloc-internal.h" - -#include // memset +#include "mimalloc/internal.h" +#include "mimalloc/prim.h" // _mi_prim_random_buf +#include // memset /* ---------------------------------------------------------------------------- We use our own PRNG to keep predictable performance of random number generation @@ -154,118 +154,13 @@ uintptr_t _mi_random_next(mi_random_ctx_t* ctx) { /* ---------------------------------------------------------------------------- -To initialize a fresh random context we rely on the OS: -- Windows : BCryptGenRandom (or RtlGenRandom) -- osX,bsd,wasi: arc4random_buf -- Linux : getrandom,/dev/urandom +To initialize a fresh random context. If we cannot get good randomness, we fall back to weak randomness based on a timer and ASLR. -----------------------------------------------------------------------------*/ -#if defined(_WIN32) - -#if !defined(MI_USE_RTLGENRANDOM) -// We prefer BCryptGenRandom over RtlGenRandom -#pragma comment (lib,"bcrypt.lib") -#include -static bool os_random_buf(void* buf, size_t buf_len) { - return (BCryptGenRandom(NULL, (PUCHAR)buf, (ULONG)buf_len, BCRYPT_USE_SYSTEM_PREFERRED_RNG) >= 0); -} -#else -// Use (unofficial) RtlGenRandom -#pragma comment (lib,"advapi32.lib") -#define RtlGenRandom SystemFunction036 -#ifdef __cplusplus -extern "C" { -#endif -BOOLEAN NTAPI RtlGenRandom(PVOID RandomBuffer, ULONG RandomBufferLength); -#ifdef __cplusplus -} -#endif -static bool os_random_buf(void* buf, size_t buf_len) { - return (RtlGenRandom(buf, (ULONG)buf_len) != 0); -} -#endif - -#elif defined(ANDROID) || defined(XP_DARWIN) || defined(__APPLE__) || defined(__DragonFly__) || \ - defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ - defined(__sun) || defined(__wasi__) -#include -static bool os_random_buf(void* buf, size_t buf_len) { - arc4random_buf(buf, buf_len); - return true; -} -#elif defined(__linux__) -#include -#include -#include -#include -#include -#include -static bool os_random_buf(void* buf, size_t buf_len) { - // Modern Linux provides `getrandom` but different distributions either use `sys/random.h` or `linux/random.h` - // and for the latter the actual `getrandom` call is not always defined. - // (see ) - // We therefore use a syscall directly and fall back dynamically to /dev/urandom when needed. 
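/* An illustrative sketch (not from the mimalloc sources): _mi_prim_random_buf above
   (and the removed os_random_buf below) issue the raw SYS_getrandom syscall and fall
   back to reading /dev/urandom on old kernels. A simplified standalone version, with
   the hypothetical name demo_random_buf and a coarser fallback policy: */
#include <sys/syscall.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <stdbool.h>
#include <stddef.h>

static bool demo_random_buf(void* buf, size_t len) {
#if defined(SYS_getrandom)
  long n = syscall(SYS_getrandom, buf, len, 0);
  if (n >= 0 && (size_t)n == len) return true;   // got everything in one call
  // otherwise (ENOSYS, short read, ...) simply fall back to /dev/urandom
#endif
  int fd = open("/dev/urandom", O_RDONLY);
  if (fd < 0) return false;
  size_t count = 0;
  while (count < len) {
    ssize_t r = read(fd, (char*)buf + count, len - count);
    if (r <= 0) {
      if (errno != EINTR && errno != EAGAIN) break;  // give up on real errors
    } else {
      count += (size_t)r;
    }
  }
  close(fd);
  return (count == len);
}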
-#ifdef SYS_getrandom - #ifndef GRND_NONBLOCK - #define GRND_NONBLOCK (1) - #endif - static _Atomic(uintptr_t) no_getrandom; // = 0 - if (mi_atomic_load_acquire(&no_getrandom)==0) { - ssize_t ret = syscall(SYS_getrandom, buf, buf_len, GRND_NONBLOCK); - if (ret >= 0) return (buf_len == (size_t)ret); - if (ret != ENOSYS) return false; - mi_atomic_store_release(&no_getrandom, 1UL); // don't call again, and fall back to /dev/urandom - } -#endif - int flags = O_RDONLY; - #if defined(O_CLOEXEC) - flags |= O_CLOEXEC; - #endif - int fd = open("/dev/urandom", flags, 0); - if (fd < 0) return false; - size_t count = 0; - while(count < buf_len) { - ssize_t ret = read(fd, (char*)buf + count, buf_len - count); - if (ret<=0) { - if (errno!=EAGAIN && errno!=EINTR) break; - } - else { - count += ret; - } - } - close(fd); - return (count==buf_len); -} -#else -static bool os_random_buf(void* buf, size_t buf_len) { - return false; -} -#endif - -#if defined(_WIN32) -#include -#elif defined(__APPLE__) -#include -#else -#include -#endif - -uintptr_t _os_random_weak(uintptr_t extra_seed) { - uintptr_t x = (uintptr_t)&_os_random_weak ^ extra_seed; // ASLR makes the address random - - #if defined(_WIN32) - LARGE_INTEGER pcount; - QueryPerformanceCounter(&pcount); - x ^= (uintptr_t)(pcount.QuadPart); - #elif defined(__APPLE__) - x ^= (uintptr_t)mach_absolute_time(); - #else - struct timespec time; - clock_gettime(CLOCK_MONOTONIC, &time); - x ^= (uintptr_t)time.tv_sec; - x ^= (uintptr_t)time.tv_nsec; - #endif +uintptr_t _mi_os_random_weak(uintptr_t extra_seed) { + uintptr_t x = (uintptr_t)&_mi_os_random_weak ^ extra_seed; // ASLR makes the address random + x ^= _mi_prim_clock_now(); // and do a few randomization steps uintptr_t max = ((x ^ (x >> 17)) & 0x0F) + 1; for (uintptr_t i = 0; i < max; i++) { @@ -275,21 +170,41 @@ uintptr_t _os_random_weak(uintptr_t extra_seed) { return x; } -void _mi_random_init(mi_random_ctx_t* ctx) { +static void mi_random_init_ex(mi_random_ctx_t* ctx, bool use_weak) { uint8_t key[32]; - if (!os_random_buf(key, sizeof(key))) { + if (use_weak || !_mi_prim_random_buf(key, sizeof(key))) { // if we fail to get random data from the OS, we fall back to a // weak random source based on the current time - _mi_warning_message("unable to use secure randomness\n"); - uintptr_t x = _os_random_weak(0); + #if !defined(__wasi__) + if (!use_weak) { _mi_warning_message("unable to use secure randomness\n"); } + #endif + uintptr_t x = _mi_os_random_weak(0); for (size_t i = 0; i < 8; i++) { // key is eight 32-bit words. 
x = _mi_random_shuffle(x); ((uint32_t*)key)[i] = (uint32_t)x; } + ctx->weak = true; + } + else { + ctx->weak = false; } chacha_init(ctx, key, (uintptr_t)ctx /*nonce*/ ); } +void _mi_random_init(mi_random_ctx_t* ctx) { + mi_random_init_ex(ctx, false); +} + +void _mi_random_init_weak(mi_random_ctx_t * ctx) { + mi_random_init_ex(ctx, true); +} + +void _mi_random_reinit_if_weak(mi_random_ctx_t * ctx) { + if (ctx->weak) { + _mi_random_init(ctx); + } +} + /* -------------------------------------------------------- test vectors from ----------------------------------------------------------- */ diff --git a/contrib/libs/mimalloc/src/region.c b/contrib/libs/mimalloc/src/region.c deleted file mode 100644 index 7954073099c0..000000000000 --- a/contrib/libs/mimalloc/src/region.c +++ /dev/null @@ -1,505 +0,0 @@ -/* ---------------------------------------------------------------------------- -Copyright (c) 2019-2020, Microsoft Research, Daan Leijen -This is free software; you can redistribute it and/or modify it under the -terms of the MIT license. A copy of the license can be found in the file -"LICENSE" at the root of this distribution. ------------------------------------------------------------------------------*/ - -/* ---------------------------------------------------------------------------- -This implements a layer between the raw OS memory (VirtualAlloc/mmap/sbrk/..) -and the segment and huge object allocation by mimalloc. There may be multiple -implementations of this (one could be the identity going directly to the OS, -another could be a simple cache etc), but the current one uses large "regions". -In contrast to the rest of mimalloc, the "regions" are shared between threads and -need to be accessed using atomic operations. -We need this memory layer between the raw OS calls because of: -1. on `sbrk` like systems (like WebAssembly) we need our own memory maps in order - to reuse memory effectively. -2. It turns out that for large objects, between 1MiB and 32MiB (?), the cost of - an OS allocation/free is still (much) too expensive relative to the accesses - in that object :-( (`malloc-large` tests this). This means we need a cheaper - way to reuse memory. -3. This layer allows for NUMA aware allocation. - -Possible issues: -- (2) can potentially be addressed too with a small cache per thread which is much - simpler. Generally though that requires shrinking of huge pages, and may overuse - memory per thread. (and is not compatible with `sbrk`). -- Since the current regions are per-process, we need atomic operations to - claim blocks which may be contended -- In the worst case, we need to search the whole region map (16KiB for 256GiB) - linearly. At what point will direct OS calls be faster? Is there a way to - do this better without adding too much complexity? 
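/* An illustrative sketch (not from the mimalloc sources): the region layer deleted
   below hands out MI_SEGMENT_SIZE blocks by atomically claiming a run of bits in a
   shared bitmap field (see the _mi_bitmap_try_find_claim_field call further down). A
   minimal claim loop over a single 64-bit field using C11 atomics could look like
   this; demo_try_claim is a hypothetical name and the sketch omits cross-field runs
   and rollback. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

// try to atomically set `count` (1..64) contiguous zero bits; on success the starting
// bit index is returned through `start_idx`
static bool demo_try_claim(_Atomic(uint64_t)* field, int count, int* start_idx) {
  uint64_t map = atomic_load_explicit(field, memory_order_relaxed);
  const uint64_t mask0 = (count >= 64 ? UINT64_MAX : (((uint64_t)1 << count) - 1));
  for (int idx = 0; idx + count <= 64; idx++) {
    const uint64_t mask = mask0 << idx;
    while ((map & mask) == 0) {               // the run is free in our snapshot
      if (atomic_compare_exchange_weak_explicit(field, &map, map | mask,
                                                memory_order_acq_rel,
                                                memory_order_relaxed)) {
        *start_idx = idx;                     // claimed the whole run
        return true;
      }
      // a failed CAS refreshed `map`; re-check the same position before moving on
    }
  }
  return false;                               // no free run of that length
}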
------------------------------------------------------------------------------*/ -#include "mimalloc.h" -#include "mimalloc-internal.h" -#include "mimalloc-atomic.h" - -#include // memset - -#include "bitmap.h" - -// Internal raw OS interface -size_t _mi_os_large_page_size(); -bool _mi_os_protect(void* addr, size_t size); -bool _mi_os_unprotect(void* addr, size_t size); -bool _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats); -bool _mi_os_decommit(void* p, size_t size, mi_stats_t* stats); -bool _mi_os_reset(void* p, size_t size, mi_stats_t* stats); -bool _mi_os_unreset(void* p, size_t size, bool* is_zero, mi_stats_t* stats); - -// arena.c -void _mi_arena_free(void* p, size_t size, size_t memid, bool all_committed, mi_stats_t* stats); -void* _mi_arena_alloc(size_t size, bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld); -void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld); - - - -// Constants -#if (MI_INTPTR_SIZE==8) -#define MI_HEAP_REGION_MAX_SIZE (256 * GiB) // 64KiB for the region map -#elif (MI_INTPTR_SIZE==4) -#define MI_HEAP_REGION_MAX_SIZE (3 * GiB) // ~ KiB for the region map -#else -#error "define the maximum heap space allowed for regions on this platform" -#endif - -#define MI_SEGMENT_ALIGN MI_SEGMENT_SIZE - -#define MI_REGION_MAX_BLOCKS MI_BITMAP_FIELD_BITS -#define MI_REGION_SIZE (MI_SEGMENT_SIZE * MI_BITMAP_FIELD_BITS) // 256MiB (64MiB on 32 bits) -#define MI_REGION_MAX (MI_HEAP_REGION_MAX_SIZE / MI_REGION_SIZE) // 1024 (48 on 32 bits) -#define MI_REGION_MAX_OBJ_BLOCKS (MI_REGION_MAX_BLOCKS/4) // 64MiB -#define MI_REGION_MAX_OBJ_SIZE (MI_REGION_MAX_OBJ_BLOCKS*MI_SEGMENT_SIZE) - -// Region info -typedef union mi_region_info_u { - uintptr_t value; - struct { - bool valid; // initialized? - bool is_large:1; // allocated in fixed large/huge OS pages - bool is_pinned:1; // pinned memory cannot be decommitted - short numa_node; // the associated NUMA node (where -1 means no associated node) - } x; -} mi_region_info_t; - - -// A region owns a chunk of REGION_SIZE (256MiB) (virtual) memory with -// a bit map with one bit per MI_SEGMENT_SIZE (4MiB) block. -typedef struct mem_region_s { - _Atomic(uintptr_t) info; // mi_region_info_t.value - _Atomic(void*) start; // start of the memory area - mi_bitmap_field_t in_use; // bit per in-use block - mi_bitmap_field_t dirty; // track if non-zero per block - mi_bitmap_field_t commit; // track if committed per block - mi_bitmap_field_t reset; // track if reset per block - _Atomic(uintptr_t) arena_memid; // if allocated from a (huge page) arena - uintptr_t padding; // round to 8 fields -} mem_region_t; - -// The region map -static mem_region_t regions[MI_REGION_MAX]; - -// Allocated regions -static _Atomic(uintptr_t) regions_count; // = 0; - - -/* ---------------------------------------------------------------------------- -Utility functions ------------------------------------------------------------------------------*/ - -// Blocks (of 4MiB) needed for the given size. -static size_t mi_region_block_count(size_t size) { - return _mi_divide_up(size, MI_SEGMENT_SIZE); -} - -/* -// Return a rounded commit/reset size such that we don't fragment large OS pages into small ones. 
-static size_t mi_good_commit_size(size_t size) { - if (size > (SIZE_MAX - _mi_os_large_page_size())) return size; - return _mi_align_up(size, _mi_os_large_page_size()); -} -*/ - -// Return if a pointer points into a region reserved by us. -bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { - if (p==NULL) return false; - size_t count = mi_atomic_load_relaxed(®ions_count); - for (size_t i = 0; i < count; i++) { - uint8_t* start = (uint8_t*)mi_atomic_load_ptr_relaxed(uint8_t, ®ions[i].start); - if (start != NULL && (uint8_t*)p >= start && (uint8_t*)p < start + MI_REGION_SIZE) return true; - } - return false; -} - - -static void* mi_region_blocks_start(const mem_region_t* region, mi_bitmap_index_t bit_idx) { - uint8_t* start = (uint8_t*)mi_atomic_load_ptr_acquire(uint8_t, &((mem_region_t*)region)->start); - mi_assert_internal(start != NULL); - return (start + (bit_idx * MI_SEGMENT_SIZE)); -} - -static size_t mi_memid_create(mem_region_t* region, mi_bitmap_index_t bit_idx) { - mi_assert_internal(bit_idx < MI_BITMAP_FIELD_BITS); - size_t idx = region - regions; - mi_assert_internal(®ions[idx] == region); - return (idx*MI_BITMAP_FIELD_BITS + bit_idx)<<1; -} - -static size_t mi_memid_create_from_arena(size_t arena_memid) { - return (arena_memid << 1) | 1; -} - - -static bool mi_memid_is_arena(size_t id, mem_region_t** region, mi_bitmap_index_t* bit_idx, size_t* arena_memid) { - if ((id&1)==1) { - if (arena_memid != NULL) *arena_memid = (id>>1); - return true; - } - else { - size_t idx = (id >> 1) / MI_BITMAP_FIELD_BITS; - *bit_idx = (mi_bitmap_index_t)(id>>1) % MI_BITMAP_FIELD_BITS; - *region = ®ions[idx]; - return false; - } -} - - -/* ---------------------------------------------------------------------------- - Allocate a region is allocated from the OS (or an arena) ------------------------------------------------------------------------------*/ - -static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large, mem_region_t** region, mi_bitmap_index_t* bit_idx, mi_os_tld_t* tld) -{ - // not out of regions yet? - if (mi_atomic_load_relaxed(®ions_count) >= MI_REGION_MAX - 1) return false; - - // try to allocate a fresh region from the OS - bool region_commit = (commit && mi_option_is_enabled(mi_option_eager_region_commit)); - bool region_large = (commit && allow_large); - bool is_zero = false; - bool is_pinned = false; - size_t arena_memid = 0; - void* const start = _mi_arena_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, ®ion_commit, ®ion_large, &is_pinned, &is_zero, &arena_memid, tld); - if (start == NULL) return false; - mi_assert_internal(!(region_large && !allow_large)); - mi_assert_internal(!region_large || region_commit); - - // claim a fresh slot - const uintptr_t idx = mi_atomic_increment_acq_rel(®ions_count); - if (idx >= MI_REGION_MAX) { - mi_atomic_decrement_acq_rel(®ions_count); - _mi_arena_free(start, MI_REGION_SIZE, arena_memid, region_commit, tld->stats); - _mi_warning_message("maximum regions used: %zu GiB (perhaps recompile with a larger setting for MI_HEAP_REGION_MAX_SIZE)", _mi_divide_up(MI_HEAP_REGION_MAX_SIZE, GiB)); - return false; - } - - // allocated, initialize and claim the initial blocks - mem_region_t* r = ®ions[idx]; - r->arena_memid = arena_memid; - mi_atomic_store_release(&r->in_use, (uintptr_t)0); - mi_atomic_store_release(&r->dirty, (is_zero ? 0 : MI_BITMAP_FIELD_FULL)); - mi_atomic_store_release(&r->commit, (region_commit ? 
MI_BITMAP_FIELD_FULL : 0)); - mi_atomic_store_release(&r->reset, (uintptr_t)0); - *bit_idx = 0; - _mi_bitmap_claim(&r->in_use, 1, blocks, *bit_idx, NULL); - mi_atomic_store_ptr_release(void,&r->start, start); - - // and share it - mi_region_info_t info; - info.value = 0; // initialize the full union to zero - info.x.valid = true; - info.x.is_large = region_large; - info.x.is_pinned = is_pinned; - info.x.numa_node = (short)_mi_os_numa_node(tld); - mi_atomic_store_release(&r->info, info.value); // now make it available to others - *region = r; - return true; -} - -/* ---------------------------------------------------------------------------- - Try to claim blocks in suitable regions ------------------------------------------------------------------------------*/ - -static bool mi_region_is_suitable(const mem_region_t* region, int numa_node, bool allow_large ) { - // initialized at all? - mi_region_info_t info; - info.value = mi_atomic_load_relaxed(&((mem_region_t*)region)->info); - if (info.value==0) return false; - - // numa correct - if (numa_node >= 0) { // use negative numa node to always succeed - int rnode = info.x.numa_node; - if (rnode >= 0 && rnode != numa_node) return false; - } - - // check allow-large - if (!allow_large && info.x.is_large) return false; - - return true; -} - - -static bool mi_region_try_claim(int numa_node, size_t blocks, bool allow_large, mem_region_t** region, mi_bitmap_index_t* bit_idx, mi_os_tld_t* tld) -{ - // try all regions for a free slot - const size_t count = mi_atomic_load_relaxed(®ions_count); // monotonic, so ok to be relaxed - size_t idx = tld->region_idx; // Or start at 0 to reuse low addresses? Starting at 0 seems to increase latency though - for (size_t visited = 0; visited < count; visited++, idx++) { - if (idx >= count) idx = 0; // wrap around - mem_region_t* r = ®ions[idx]; - // if this region suits our demand (numa node matches, large OS page matches) - if (mi_region_is_suitable(r, numa_node, allow_large)) { - // then try to atomically claim a segment(s) in this region - if (_mi_bitmap_try_find_claim_field(&r->in_use, 0, blocks, bit_idx)) { - tld->region_idx = idx; // remember the last found position - *region = r; - return true; - } - } - } - return false; -} - - -static void* mi_region_try_alloc(size_t blocks, bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld) -{ - mi_assert_internal(blocks <= MI_BITMAP_FIELD_BITS); - mem_region_t* region; - mi_bitmap_index_t bit_idx; - const int numa_node = (_mi_os_numa_node_count() <= 1 ? 
-1 : _mi_os_numa_node(tld)); - // try to claim in existing regions - if (!mi_region_try_claim(numa_node, blocks, *large, ®ion, &bit_idx, tld)) { - // otherwise try to allocate a fresh region and claim in there - if (!mi_region_try_alloc_os(blocks, *commit, *large, ®ion, &bit_idx, tld)) { - // out of regions or memory - return NULL; - } - } - - // ------------------------------------------------ - // found a region and claimed `blocks` at `bit_idx`, initialize them now - mi_assert_internal(region != NULL); - mi_assert_internal(_mi_bitmap_is_claimed(®ion->in_use, 1, blocks, bit_idx)); - - mi_region_info_t info; - info.value = mi_atomic_load_acquire(®ion->info); - uint8_t* start = (uint8_t*)mi_atomic_load_ptr_acquire(uint8_t,®ion->start); - mi_assert_internal(!(info.x.is_large && !*large)); - mi_assert_internal(start != NULL); - - *is_zero = _mi_bitmap_claim(®ion->dirty, 1, blocks, bit_idx, NULL); - *large = info.x.is_large; - *is_pinned = info.x.is_pinned; - *memid = mi_memid_create(region, bit_idx); - void* p = start + (mi_bitmap_index_bit_in_field(bit_idx) * MI_SEGMENT_SIZE); - - // commit - if (*commit) { - // ensure commit - bool any_uncommitted; - _mi_bitmap_claim(®ion->commit, 1, blocks, bit_idx, &any_uncommitted); - if (any_uncommitted) { - mi_assert_internal(!info.x.is_large && !info.x.is_pinned); - bool commit_zero = false; - if (!_mi_mem_commit(p, blocks * MI_SEGMENT_SIZE, &commit_zero, tld)) { - // failed to commit! unclaim and return - mi_bitmap_unclaim(®ion->in_use, 1, blocks, bit_idx); - return NULL; - } - if (commit_zero) *is_zero = true; - } - } - else { - // no need to commit, but check if already fully committed - *commit = _mi_bitmap_is_claimed(®ion->commit, 1, blocks, bit_idx); - } - mi_assert_internal(!*commit || _mi_bitmap_is_claimed(®ion->commit, 1, blocks, bit_idx)); - - // unreset reset blocks - if (_mi_bitmap_is_any_claimed(®ion->reset, 1, blocks, bit_idx)) { - // some blocks are still reset - mi_assert_internal(!info.x.is_large && !info.x.is_pinned); - mi_assert_internal(!mi_option_is_enabled(mi_option_eager_commit) || *commit || mi_option_get(mi_option_eager_commit_delay) > 0); - mi_bitmap_unclaim(®ion->reset, 1, blocks, bit_idx); - if (*commit || !mi_option_is_enabled(mi_option_reset_decommits)) { // only if needed - bool reset_zero = false; - _mi_mem_unreset(p, blocks * MI_SEGMENT_SIZE, &reset_zero, tld); - if (reset_zero) *is_zero = true; - } - } - mi_assert_internal(!_mi_bitmap_is_any_claimed(®ion->reset, 1, blocks, bit_idx)); - - #if (MI_DEBUG>=2) - if (*commit) { ((uint8_t*)p)[0] = 0; } - #endif - - // and return the allocation - mi_assert_internal(p != NULL); - return p; -} - - -/* ---------------------------------------------------------------------------- - Allocation ------------------------------------------------------------------------------*/ - -// Allocate `size` memory aligned at `alignment`. Return non NULL on success, with a given memory `id`. 
-// (`id` is abstract, but `id = idx*MI_REGION_MAP_BITS + bitidx`) -void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld) -{ - mi_assert_internal(memid != NULL && tld != NULL); - mi_assert_internal(size > 0); - *memid = 0; - *is_zero = false; - *is_pinned = false; - bool default_large = false; - if (large==NULL) large = &default_large; // ensure `large != NULL` - if (size == 0) return NULL; - size = _mi_align_up(size, _mi_os_page_size()); - - // allocate from regions if possible - void* p = NULL; - size_t arena_memid; - const size_t blocks = mi_region_block_count(size); - if (blocks <= MI_REGION_MAX_OBJ_BLOCKS && alignment <= MI_SEGMENT_ALIGN) { - p = mi_region_try_alloc(blocks, commit, large, is_pinned, is_zero, memid, tld); - if (p == NULL) { - _mi_warning_message("unable to allocate from region: size %zu\n", size); - } - } - if (p == NULL) { - // and otherwise fall back to the OS - p = _mi_arena_alloc_aligned(size, alignment, commit, large, is_pinned, is_zero, &arena_memid, tld); - *memid = mi_memid_create_from_arena(arena_memid); - } - - if (p != NULL) { - mi_assert_internal((uintptr_t)p % alignment == 0); -#if (MI_DEBUG>=2) - if (*commit) { ((uint8_t*)p)[0] = 0; } // ensure the memory is committed -#endif - } - return p; -} - - - -/* ---------------------------------------------------------------------------- -Free ------------------------------------------------------------------------------*/ - -// Free previously allocated memory with a given id. -void _mi_mem_free(void* p, size_t size, size_t id, bool full_commit, bool any_reset, mi_os_tld_t* tld) { - mi_assert_internal(size > 0 && tld != NULL); - if (p==NULL) return; - if (size==0) return; - size = _mi_align_up(size, _mi_os_page_size()); - - size_t arena_memid = 0; - mi_bitmap_index_t bit_idx; - mem_region_t* region; - if (mi_memid_is_arena(id,®ion,&bit_idx,&arena_memid)) { - // was a direct arena allocation, pass through - _mi_arena_free(p, size, arena_memid, full_commit, tld->stats); - } - else { - // allocated in a region - mi_assert_internal(size <= MI_REGION_MAX_OBJ_SIZE); if (size > MI_REGION_MAX_OBJ_SIZE) return; - const size_t blocks = mi_region_block_count(size); - mi_assert_internal(blocks + bit_idx <= MI_BITMAP_FIELD_BITS); - mi_region_info_t info; - info.value = mi_atomic_load_acquire(®ion->info); - mi_assert_internal(info.value != 0); - void* blocks_start = mi_region_blocks_start(region, bit_idx); - mi_assert_internal(blocks_start == p); // not a pointer in our area? - mi_assert_internal(bit_idx + blocks <= MI_BITMAP_FIELD_BITS); - if (blocks_start != p || bit_idx + blocks > MI_BITMAP_FIELD_BITS) return; // or `abort`? - - // committed? - if (full_commit && (size % MI_SEGMENT_SIZE) == 0) { - _mi_bitmap_claim(®ion->commit, 1, blocks, bit_idx, NULL); - } - - if (any_reset) { - // set the is_reset bits if any pages were reset - _mi_bitmap_claim(®ion->reset, 1, blocks, bit_idx, NULL); - } - - // reset the blocks to reduce the working set. 
- if (!info.x.is_large && !info.x.is_pinned && mi_option_is_enabled(mi_option_segment_reset) - && (mi_option_is_enabled(mi_option_eager_commit) || - mi_option_is_enabled(mi_option_reset_decommits))) // cannot reset halfway committed segments, use only `option_page_reset` instead - { - bool any_unreset; - _mi_bitmap_claim(®ion->reset, 1, blocks, bit_idx, &any_unreset); - if (any_unreset) { - _mi_abandoned_await_readers(); // ensure no more pending write (in case reset = decommit) - _mi_mem_reset(p, blocks * MI_SEGMENT_SIZE, tld); - } - } - - // and unclaim - bool all_unclaimed = mi_bitmap_unclaim(®ion->in_use, 1, blocks, bit_idx); - mi_assert_internal(all_unclaimed); UNUSED(all_unclaimed); - } -} - - -/* ---------------------------------------------------------------------------- - collection ------------------------------------------------------------------------------*/ -void _mi_mem_collect(mi_os_tld_t* tld) { - // free every region that has no segments in use. - uintptr_t rcount = mi_atomic_load_relaxed(®ions_count); - for (size_t i = 0; i < rcount; i++) { - mem_region_t* region = ®ions[i]; - if (mi_atomic_load_relaxed(®ion->info) != 0) { - // if no segments used, try to claim the whole region - uintptr_t m = mi_atomic_load_relaxed(®ion->in_use); - while (m == 0 && !mi_atomic_cas_weak_release(®ion->in_use, &m, MI_BITMAP_FIELD_FULL)) { /* nothing */ }; - if (m == 0) { - // on success, free the whole region - uint8_t* start = (uint8_t*)mi_atomic_load_ptr_acquire(uint8_t,®ions[i].start); - size_t arena_memid = mi_atomic_load_relaxed(®ions[i].arena_memid); - uintptr_t commit = mi_atomic_load_relaxed(®ions[i].commit); - memset(®ions[i], 0, sizeof(mem_region_t)); - // and release the whole region - mi_atomic_store_release(®ion->info, (uintptr_t)0); - if (start != NULL) { // && !_mi_os_is_huge_reserved(start)) { - _mi_abandoned_await_readers(); // ensure no pending reads - _mi_arena_free(start, MI_REGION_SIZE, arena_memid, (~commit == 0), tld->stats); - } - } - } - } -} - - -/* ---------------------------------------------------------------------------- - Other ------------------------------------------------------------------------------*/ - -bool _mi_mem_reset(void* p, size_t size, mi_os_tld_t* tld) { - return _mi_os_reset(p, size, tld->stats); -} - -bool _mi_mem_unreset(void* p, size_t size, bool* is_zero, mi_os_tld_t* tld) { - return _mi_os_unreset(p, size, is_zero, tld->stats); -} - -bool _mi_mem_commit(void* p, size_t size, bool* is_zero, mi_os_tld_t* tld) { - return _mi_os_commit(p, size, is_zero, tld->stats); -} - -bool _mi_mem_decommit(void* p, size_t size, mi_os_tld_t* tld) { - return _mi_os_decommit(p, size, tld->stats); -} - -bool _mi_mem_protect(void* p, size_t size) { - return _mi_os_protect(p, size); -} - -bool _mi_mem_unprotect(void* p, size_t size) { - return _mi_os_unprotect(p, size); -} diff --git a/contrib/libs/mimalloc/src/segment-map.c b/contrib/libs/mimalloc/src/segment-map.c new file mode 100644 index 000000000000..1efb1e2360bf --- /dev/null +++ b/contrib/libs/mimalloc/src/segment-map.c @@ -0,0 +1,155 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2019-2023, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. 
+-----------------------------------------------------------------------------*/ + +/* ----------------------------------------------------------- + The following functions are to reliably find the segment or + block that encompasses any pointer p (or NULL if it is not + in any of our segments). + We maintain a bitmap of all memory with 1 bit per MI_SEGMENT_SIZE (64MiB) + set to 1 if it contains the segment meta data. +----------------------------------------------------------- */ +#include "mimalloc.h" +#include "mimalloc/internal.h" +#include "mimalloc/atomic.h" + +#if (MI_INTPTR_SIZE>=8) && MI_TRACK_ASAN +#define MI_MAX_ADDRESS ((size_t)140 << 40) // 140TB (see issue #881) +#elif (MI_INTPTR_SIZE >= 8) +#define MI_MAX_ADDRESS ((size_t)40 << 40) // 40TB (to include huge page areas) +#else +#define MI_MAX_ADDRESS ((size_t)2 << 30) // 2Gb +#endif + +#define MI_SEGMENT_MAP_BITS (MI_MAX_ADDRESS / MI_SEGMENT_SIZE) +#define MI_SEGMENT_MAP_SIZE (MI_SEGMENT_MAP_BITS / 8) +#define MI_SEGMENT_MAP_WSIZE (MI_SEGMENT_MAP_SIZE / MI_INTPTR_SIZE) + +static _Atomic(uintptr_t) mi_segment_map[MI_SEGMENT_MAP_WSIZE + 1]; // 2KiB per TB with 64MiB segments + +static size_t mi_segment_map_index_of(const mi_segment_t* segment, size_t* bitidx) { + // note: segment can be invalid or NULL. + mi_assert_internal(_mi_ptr_segment(segment + 1) == segment); // is it aligned on MI_SEGMENT_SIZE? + if ((uintptr_t)segment >= MI_MAX_ADDRESS) { + *bitidx = 0; + return MI_SEGMENT_MAP_WSIZE; + } + else { + const uintptr_t segindex = ((uintptr_t)segment) / MI_SEGMENT_SIZE; + *bitidx = segindex % MI_INTPTR_BITS; + const size_t mapindex = segindex / MI_INTPTR_BITS; + mi_assert_internal(mapindex < MI_SEGMENT_MAP_WSIZE); + return mapindex; + } +} + +void _mi_segment_map_allocated_at(const mi_segment_t* segment) { + size_t bitidx; + size_t index = mi_segment_map_index_of(segment, &bitidx); + mi_assert_internal(index <= MI_SEGMENT_MAP_WSIZE); + if (index==MI_SEGMENT_MAP_WSIZE) return; + uintptr_t mask = mi_atomic_load_relaxed(&mi_segment_map[index]); + uintptr_t newmask; + do { + newmask = (mask | ((uintptr_t)1 << bitidx)); + } while (!mi_atomic_cas_weak_release(&mi_segment_map[index], &mask, newmask)); +} + +void _mi_segment_map_freed_at(const mi_segment_t* segment) { + size_t bitidx; + size_t index = mi_segment_map_index_of(segment, &bitidx); + mi_assert_internal(index <= MI_SEGMENT_MAP_WSIZE); + if (index == MI_SEGMENT_MAP_WSIZE) return; + uintptr_t mask = mi_atomic_load_relaxed(&mi_segment_map[index]); + uintptr_t newmask; + do { + newmask = (mask & ~((uintptr_t)1 << bitidx)); + } while (!mi_atomic_cas_weak_release(&mi_segment_map[index], &mask, newmask)); +} + +// Determine the segment belonging to a pointer or NULL if it is not in a valid segment. +static mi_segment_t* _mi_segment_of(const void* p) { + if (p == NULL) return NULL; + mi_segment_t* segment = _mi_ptr_segment(p); // segment can be NULL + size_t bitidx; + size_t index = mi_segment_map_index_of(segment, &bitidx); + // fast path: for any pointer to valid small/medium/large object or first MI_SEGMENT_SIZE in huge + const uintptr_t mask = mi_atomic_load_relaxed(&mi_segment_map[index]); + if mi_likely((mask & ((uintptr_t)1 << bitidx)) != 0) { + return segment; // yes, allocated by us + } + if (index==MI_SEGMENT_MAP_WSIZE) return NULL; + + // TODO: maintain max/min allocated range for efficiency for more efficient rejection of invalid pointers? 
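/* An illustrative sketch (not from the mimalloc sources): the segment map in this new
   file keeps one bit per MI_SEGMENT_SIZE slice of the address space, so "was this
   pointer allocated by us?" reduces to an index/bit computation plus one atomic load.
   A simplified standalone version (64-bit build assumed; the fixed 64MiB slice size
   and 40TiB coverage follow the constants above, the demo_ names are hypothetical): */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>

#define DEMO_SLICE_SIZE  ((size_t)64 << 20)                     // 64 MiB per bit
#define DEMO_MAX_ADDRESS ((size_t)40 << 40)                     // 40 TiB covered
#define DEMO_MAP_WORDS   (DEMO_MAX_ADDRESS / DEMO_SLICE_SIZE / 64)

static _Atomic(uint64_t) demo_map[DEMO_MAP_WORDS];              // zero-initialized

static void demo_map_mark(const void* segment_start) {
  if ((uintptr_t)segment_start >= DEMO_MAX_ADDRESS) return;     // outside the map
  size_t slice = (uintptr_t)segment_start / DEMO_SLICE_SIZE;
  atomic_fetch_or_explicit(&demo_map[slice / 64],
                           (uint64_t)1 << (slice % 64), memory_order_release);
}

static bool demo_map_contains(const void* p) {
  if ((uintptr_t)p >= DEMO_MAX_ADDRESS) return false;
  size_t slice = (uintptr_t)p / DEMO_SLICE_SIZE;
  uint64_t word = atomic_load_explicit(&demo_map[slice / 64], memory_order_relaxed);
  return ((word >> (slice % 64)) & 1) != 0;
}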
+ + // search downwards for the first segment in case it is an interior pointer + // could be slow but searches in MI_INTPTR_SIZE * MI_SEGMENT_SIZE (512MiB) steps trough + // valid huge objects + // note: we could maintain a lowest index to speed up the path for invalid pointers? + size_t lobitidx; + size_t loindex; + uintptr_t lobits = mask & (((uintptr_t)1 << bitidx) - 1); + if (lobits != 0) { + loindex = index; + lobitidx = mi_bsr(lobits); // lobits != 0 + } + else if (index == 0) { + return NULL; + } + else { + mi_assert_internal(index > 0); + uintptr_t lomask = mask; + loindex = index; + do { + loindex--; + lomask = mi_atomic_load_relaxed(&mi_segment_map[loindex]); + } while (lomask != 0 && loindex > 0); + if (lomask == 0) return NULL; + lobitidx = mi_bsr(lomask); // lomask != 0 + } + mi_assert_internal(loindex < MI_SEGMENT_MAP_WSIZE); + // take difference as the addresses could be larger than the MAX_ADDRESS space. + size_t diff = (((index - loindex) * (8*MI_INTPTR_SIZE)) + bitidx - lobitidx) * MI_SEGMENT_SIZE; + segment = (mi_segment_t*)((uint8_t*)segment - diff); + + if (segment == NULL) return NULL; + mi_assert_internal((void*)segment < p); + bool cookie_ok = (_mi_ptr_cookie(segment) == segment->cookie); + mi_assert_internal(cookie_ok); + if mi_unlikely(!cookie_ok) return NULL; + if (((uint8_t*)segment + mi_segment_size(segment)) <= (uint8_t*)p) return NULL; // outside the range + mi_assert_internal(p >= (void*)segment && (uint8_t*)p < (uint8_t*)segment + mi_segment_size(segment)); + return segment; +} + +// Is this a valid pointer in our heap? +static bool mi_is_valid_pointer(const void* p) { + return ((_mi_segment_of(p) != NULL) || (_mi_arena_contains(p))); +} + +mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { + return mi_is_valid_pointer(p); +} + +/* +// Return the full segment range belonging to a pointer +static void* mi_segment_range_of(const void* p, size_t* size) { + mi_segment_t* segment = _mi_segment_of(p); + if (segment == NULL) { + if (size != NULL) *size = 0; + return NULL; + } + else { + if (size != NULL) *size = segment->segment_size; + return segment; + } + mi_assert_expensive(page == NULL || mi_segment_is_valid(_mi_page_segment(page),tld)); + mi_assert_internal(page == NULL || (mi_segment_page_size(_mi_page_segment(page)) - (MI_SECURE == 0 ? 0 : _mi_os_page_size())) >= block_size); + mi_reset_delayed(tld); + mi_assert_internal(page == NULL || mi_page_not_in_queue(page, tld)); + return page; +} +*/ diff --git a/contrib/libs/mimalloc/src/segment.c b/contrib/libs/mimalloc/src/segment.c index 1d59be9d06e6..fc13d2e7784b 100644 --- a/contrib/libs/mimalloc/src/segment.c +++ b/contrib/libs/mimalloc/src/segment.c @@ -1,12 +1,12 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2020, Microsoft Research, Daan Leijen +Copyright (c) 2018-2024, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. 
-----------------------------------------------------------------------------*/
 #include "mimalloc.h"
-#include "mimalloc-internal.h"
-#include "mimalloc-atomic.h"
+#include "mimalloc/internal.h"
+#include "mimalloc/atomic.h"
 
 #include   // memset
 #include 
@@ -17,22 +17,23 @@ static uint8_t* mi_segment_raw_page_start(const mi_segment_t* segment, const mi_
 
 /* --------------------------------------------------------------------------------
   Segment allocation
-  We allocate pages inside bigger "segments" (4mb on 64-bit). This is to avoid
+  We allocate pages inside bigger "segments" (4MiB on 64-bit). This is to avoid
   splitting VMA's on Linux and reduce fragmentation on other OS's.
   Each thread owns its own segments.
 
   Currently we have:
-  - small pages (64kb), 64 in one segment
-  - medium pages (512kb), 8 in one segment
-  - large pages (4mb), 1 in one segment
-  - huge blocks > MI_LARGE_OBJ_SIZE_MAX become large segment with 1 page
+  - small pages (64KiB), 64 in one segment
+  - medium pages (512KiB), 8 in one segment
+  - large pages (4MiB), 1 in one segment
+  - huge segments have 1 page in one segment that can be larger than `MI_SEGMENT_SIZE`.
+    it is used for blocks `> MI_LARGE_OBJ_SIZE_MAX` or with alignment `> MI_BLOCK_ALIGNMENT_MAX`.
 
-  In any case the memory for a segment is virtual and usually committed on demand.
+  The memory for a segment is usually committed on demand.
   (i.e. we are careful to not touch the memory until we actually allocate a block there)
 
-  If a thread ends, it "abandons" pages with used blocks
-  and there is an abandoned segment list whose segments can
-  be reclaimed by still running threads, much like work-stealing.
+  If a thread ends, it "abandons" pages that still contain live blocks.
+  Such segments are abandoned and these can be reclaimed by still running threads,
+  (much like work-stealing).
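/* An illustrative sketch (not from the mimalloc sources): because every segment
   described above is allocated at a MI_SEGMENT_SIZE-aligned address, the owning
   segment of any block pointer is found by masking the low bits (this is what
   _mi_ptr_segment does in the real code). The 4MiB constant, the demo_ names and the
   struct layout below are assumptions used only for this example. */
#include <stdint.h>

#define DEMO_SEGMENT_SIZE ((uintptr_t)4 << 20)   // 4 MiB; segments assumed aligned to this

typedef struct demo_segment_s {
  uintptr_t thread_id;                           // owning thread, checked on free
  /* ... page metadata follows in the real layout ... */
} demo_segment_t;

static demo_segment_t* demo_ptr_segment(const void* p) {
  // round the pointer down to the segment alignment to reach the segment header
  return (demo_segment_t*)((uintptr_t)p & ~(DEMO_SEGMENT_SIZE - 1));
}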
-------------------------------------------------------------------------------- */ @@ -54,9 +55,11 @@ static bool mi_segment_queue_contains(const mi_segment_queue_t* queue, const mi_ } #endif +/* static bool mi_segment_queue_is_empty(const mi_segment_queue_t* queue) { return (queue->first == NULL); } +*/ static void mi_segment_queue_remove(mi_segment_queue_t* queue, mi_segment_t* segment) { mi_assert_expensive(mi_segment_queue_contains(queue, segment)); @@ -110,17 +113,7 @@ static void mi_segment_insert_in_free_queue(mi_segment_t* segment, mi_segments_t Invariant checking ----------------------------------------------------------- */ -#if (MI_DEBUG>=2) -static bool mi_segment_is_in_free_queue(const mi_segment_t* segment, mi_segments_tld_t* tld) { - mi_segment_queue_t* queue = mi_segment_free_queue(segment, tld); - bool in_queue = (queue!=NULL && (segment->next != NULL || segment->prev != NULL || queue->first == segment)); - if (in_queue) { - mi_assert_expensive(mi_segment_queue_contains(queue, segment)); - } - return in_queue; -} -#endif - +#if (MI_DEBUG >= 2) || (MI_SECURE >= 2) static size_t mi_segment_page_size(const mi_segment_t* segment) { if (segment->capacity > 1) { mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM); @@ -131,11 +124,11 @@ static size_t mi_segment_page_size(const mi_segment_t* segment) { return segment->segment_size; } } - +#endif #if (MI_DEBUG>=2) -static bool mi_pages_reset_contains(const mi_page_t* page, mi_segments_tld_t* tld) { - mi_page_t* p = tld->pages_reset.first; +static bool mi_pages_purge_contains(const mi_page_t* page, mi_segments_tld_t* tld) { + mi_page_t* p = tld->pages_purge.first; while (p != NULL) { if (p == page) return true; p = p->next; @@ -150,15 +143,17 @@ static bool mi_segment_is_valid(const mi_segment_t* segment, mi_segments_tld_t* mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie); mi_assert_internal(segment->used <= segment->capacity); mi_assert_internal(segment->abandoned <= segment->used); + mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM || segment->capacity == 1); // one large or huge page per segment size_t nfree = 0; for (size_t i = 0; i < segment->capacity; i++) { const mi_page_t* const page = &segment->pages[i]; if (!page->segment_in_use) { nfree++; } - if (page->segment_in_use || page->is_reset) { - mi_assert_expensive(!mi_pages_reset_contains(page, tld)); + if (page->segment_in_use) { + mi_assert_expensive(!mi_pages_purge_contains(page, tld)); } + mi_assert_internal(page->is_huge == (segment->page_kind == MI_PAGE_HUGE)); } mi_assert_internal(nfree + segment->used == segment->capacity); // mi_assert_internal(segment->thread_id == _mi_thread_id() || (segment->thread_id==0)); // or 0 @@ -171,12 +166,12 @@ static bool mi_segment_is_valid(const mi_segment_t* segment, mi_segments_tld_t* static bool mi_page_not_in_queue(const mi_page_t* page, mi_segments_tld_t* tld) { mi_assert_internal(page != NULL); if (page->next != NULL || page->prev != NULL) { - mi_assert_internal(mi_pages_reset_contains(page, tld)); + mi_assert_internal(mi_pages_purge_contains(page, tld)); return false; } else { // both next and prev are NULL, check for singleton list - return (tld->pages_reset.first != page && tld->pages_reset.last != page); + return (tld->pages_purge.first != page && tld->pages_purge.last != page); } } @@ -187,10 +182,10 @@ static bool mi_page_not_in_queue(const mi_page_t* page, mi_segments_tld_t* tld) static void mi_segment_protect_range(void* p, size_t size, bool protect) { if (protect) { - _mi_mem_protect(p, size); 
+ _mi_os_protect(p, size); } else { - _mi_mem_unprotect(p, size); + _mi_os_unprotect(p, size); } } @@ -202,14 +197,17 @@ static void mi_segment_protect(mi_segment_t* segment, bool protect, mi_os_tld_t* mi_assert_internal((segment->segment_info_size - os_psize) >= (sizeof(mi_segment_t) + ((segment->capacity - 1) * sizeof(mi_page_t)))); mi_assert_internal(((uintptr_t)segment + segment->segment_info_size) % os_psize == 0); mi_segment_protect_range((uint8_t*)segment + segment->segment_info_size - os_psize, os_psize, protect); - if (MI_SECURE <= 1 || segment->capacity == 1) { + #if (MI_SECURE >= 2) + if (segment->capacity == 1) + #endif + { // and protect the last (or only) page too mi_assert_internal(MI_SECURE <= 1 || segment->page_kind >= MI_PAGE_LARGE); uint8_t* start = (uint8_t*)segment + segment->segment_size - os_psize; - if (protect && !segment->mem_is_committed) { + if (protect && !segment->memid.initially_committed) { if (protect) { // ensure secure page is committed - if (_mi_mem_commit(start, os_psize, NULL, tld)) { // if this fails that is ok (as it is an unaccessible page) + if (_mi_os_commit(start, os_psize, NULL, tld->stats)) { // if this fails that is ok (as it is an unaccessible page) mi_segment_protect_range(start, os_psize, protect); } } @@ -218,6 +216,7 @@ static void mi_segment_protect(mi_segment_t* segment, bool protect, mi_os_tld_t* mi_segment_protect_range(start, os_psize, protect); } } + #if (MI_SECURE >= 2) else { // or protect every page const size_t page_size = mi_segment_page_size(segment); @@ -227,6 +226,7 @@ static void mi_segment_protect(mi_segment_t* segment, bool protect, mi_os_tld_t* } } } + #endif } } @@ -234,35 +234,39 @@ static void mi_segment_protect(mi_segment_t* segment, bool protect, mi_os_tld_t* Page reset ----------------------------------------------------------- */ -static void mi_page_reset(mi_segment_t* segment, mi_page_t* page, size_t size, mi_segments_tld_t* tld) { +static void mi_page_purge(mi_segment_t* segment, mi_page_t* page, mi_segments_tld_t* tld) { + // todo: should we purge the guard page as well when MI_SECURE>=2 ? mi_assert_internal(page->is_committed); - if (!mi_option_is_enabled(mi_option_page_reset)) return; - if (segment->mem_is_pinned || page->segment_in_use || !page->is_committed || page->is_reset) return; + mi_assert_internal(!page->segment_in_use); + if (!segment->allow_purge) return; + mi_assert_internal(page->used == 0); + mi_assert_internal(page->free == NULL); + mi_assert_expensive(!mi_pages_purge_contains(page, tld)); size_t psize; void* start = mi_segment_raw_page_start(segment, page, &psize); - page->is_reset = true; - mi_assert_internal(size <= psize); - size_t reset_size = ((size == 0 || size > psize) ? 
psize : size); - if (reset_size > 0) _mi_mem_reset(start, reset_size, tld->os); + const bool needs_recommit = _mi_os_purge(start, psize, tld->stats); + if (needs_recommit) { page->is_committed = false; } } -static bool mi_page_unreset(mi_segment_t* segment, mi_page_t* page, size_t size, mi_segments_tld_t* tld) -{ - mi_assert_internal(page->is_reset); - mi_assert_internal(page->is_committed); - mi_assert_internal(!segment->mem_is_pinned); - if (segment->mem_is_pinned || !page->is_committed || !page->is_reset) return true; - page->is_reset = false; +static bool mi_page_ensure_committed(mi_segment_t* segment, mi_page_t* page, mi_segments_tld_t* tld) { + if (page->is_committed) return true; + mi_assert_internal(segment->allow_decommit); + mi_assert_expensive(!mi_pages_purge_contains(page, tld)); + size_t psize; uint8_t* start = mi_segment_raw_page_start(segment, page, &psize); - size_t unreset_size = (size == 0 || size > psize ? psize : size); bool is_zero = false; - bool ok = true; - if (unreset_size > 0) { - ok = _mi_mem_unreset(start, unreset_size, &is_zero, tld->os); + const size_t gsize = (MI_SECURE >= 2 ? _mi_os_page_size() : 0); + bool ok = _mi_os_commit(start, psize + gsize, &is_zero, tld->stats); + if (!ok) return false; // failed to commit! + page->is_committed = true; + page->used = 0; + page->free = NULL; + page->is_zero_init = is_zero; + if (gsize > 0) { + mi_segment_protect_range(start + psize, gsize, true); } - if (is_zero) page->is_zero_init = true; - return ok; + return true; } @@ -270,37 +274,49 @@ static bool mi_page_unreset(mi_segment_t* segment, mi_page_t* page, size_t size, The free page queue ----------------------------------------------------------- */ -// we re-use the `used` field for the expiration counter. Since this is a -// a 32-bit field while the clock is always 64-bit we need to guard -// against overflow, we use substraction to check for expiry which work +// we re-use the `free` field for the expiration counter. Since this is a +// a pointer size field while the clock is always 64-bit we need to guard +// against overflow, we use substraction to check for expiry which works // as long as the reset delay is under (2^30 - 1) milliseconds (~12 days) -static void mi_page_reset_set_expire(mi_page_t* page) { - uint32_t expire = (uint32_t)_mi_clock_now() + mi_option_get(mi_option_reset_delay); - page->used = expire; +static uint32_t mi_page_get_expire( mi_page_t* page ) { + return (uint32_t)((uintptr_t)page->free); } -static bool mi_page_reset_is_expired(mi_page_t* page, mi_msecs_t now) { - int32_t expire = (int32_t)(page->used); +static void mi_page_set_expire( mi_page_t* page, uint32_t expire ) { + page->free = (mi_block_t*)((uintptr_t)expire); +} + +static void mi_page_purge_set_expire(mi_page_t* page) { + mi_assert_internal(mi_page_get_expire(page)==0); + uint32_t expire = (uint32_t)_mi_clock_now() + mi_option_get(mi_option_purge_delay); + mi_page_set_expire(page, expire); +} + +// we re-use the `free` field for the expiration counter. 
Since this is a +// a pointer size field while the clock is always 64-bit we need to guard +// against overflow, we use substraction to check for expiry which work +// as long as the reset delay is under (2^30 - 1) milliseconds (~12 days) +static bool mi_page_purge_is_expired(mi_page_t* page, mi_msecs_t now) { + int32_t expire = (int32_t)mi_page_get_expire(page); return (((int32_t)now - expire) >= 0); } -static void mi_pages_reset_add(mi_segment_t* segment, mi_page_t* page, mi_segments_tld_t* tld) { - mi_assert_internal(!page->segment_in_use || !page->is_committed); +static void mi_segment_schedule_purge(mi_segment_t* segment, mi_page_t* page, mi_segments_tld_t* tld) { + mi_assert_internal(!page->segment_in_use); mi_assert_internal(mi_page_not_in_queue(page,tld)); - mi_assert_expensive(!mi_pages_reset_contains(page, tld)); + mi_assert_expensive(!mi_pages_purge_contains(page, tld)); mi_assert_internal(_mi_page_segment(page)==segment); - if (!mi_option_is_enabled(mi_option_page_reset)) return; - if (segment->mem_is_pinned || page->segment_in_use || !page->is_committed || page->is_reset) return; + if (!segment->allow_purge) return; - if (mi_option_get(mi_option_reset_delay) == 0) { - // reset immediately? - mi_page_reset(segment, page, 0, tld); + if (mi_option_get(mi_option_purge_delay) == 0) { + // purge immediately? + mi_page_purge(segment, page, tld); } - else { + else if (mi_option_get(mi_option_purge_delay) > 0) { // no purging if the delay is negative // otherwise push on the delayed page reset queue - mi_page_queue_t* pq = &tld->pages_reset; + mi_page_queue_t* pq = &tld->pages_purge; // push on top - mi_page_reset_set_expire(page); + mi_page_purge_set_expire(page); page->next = pq->first; page->prev = NULL; if (pq->first == NULL) { @@ -314,29 +330,30 @@ static void mi_pages_reset_add(mi_segment_t* segment, mi_page_t* page, mi_segmen } } -static void mi_pages_reset_remove(mi_page_t* page, mi_segments_tld_t* tld) { +static void mi_page_purge_remove(mi_page_t* page, mi_segments_tld_t* tld) { if (mi_page_not_in_queue(page,tld)) return; - mi_page_queue_t* pq = &tld->pages_reset; + mi_page_queue_t* pq = &tld->pages_purge; mi_assert_internal(pq!=NULL); mi_assert_internal(!page->segment_in_use); - mi_assert_internal(mi_pages_reset_contains(page, tld)); + mi_assert_internal(mi_page_get_expire(page) != 0); + mi_assert_internal(mi_pages_purge_contains(page, tld)); if (page->prev != NULL) page->prev->next = page->next; if (page->next != NULL) page->next->prev = page->prev; if (page == pq->last) pq->last = page->prev; if (page == pq->first) pq->first = page->next; page->next = page->prev = NULL; - page->used = 0; + mi_page_set_expire(page,0); } -static void mi_pages_reset_remove_all_in_segment(mi_segment_t* segment, bool force_reset, mi_segments_tld_t* tld) { - if (segment->mem_is_pinned) return; // never reset in huge OS pages +static void mi_segment_remove_all_purges(mi_segment_t* segment, bool force_purge, mi_segments_tld_t* tld) { + if (segment->memid.is_pinned) return; // never reset in huge OS pages for (size_t i = 0; i < segment->capacity; i++) { mi_page_t* page = &segment->pages[i]; - if (!page->segment_in_use && page->is_committed && !page->is_reset) { - mi_pages_reset_remove(page, tld); - if (force_reset) { - mi_page_reset(segment, page, 0, tld); + if (!page->segment_in_use) { + mi_page_purge_remove(page, tld); + if (force_purge && page->is_committed) { + mi_page_purge(segment, page, tld); } } else { @@ -345,17 +362,17 @@ static void mi_pages_reset_remove_all_in_segment(mi_segment_t* 
segment, bool for } } -static void mi_reset_delayed(mi_segments_tld_t* tld) { - if (!mi_option_is_enabled(mi_option_page_reset)) return; +static void mi_pages_try_purge(bool force, mi_segments_tld_t* tld) { + if (mi_option_get(mi_option_purge_delay) < 0) return; // purging is not allowed + mi_msecs_t now = _mi_clock_now(); - mi_page_queue_t* pq = &tld->pages_reset; + mi_page_queue_t* pq = &tld->pages_purge; // from oldest up to the first that has not expired yet mi_page_t* page = pq->last; - while (page != NULL && mi_page_reset_is_expired(page,now)) { + while (page != NULL && (force || mi_page_purge_is_expired(page,now))) { mi_page_t* const prev = page->prev; // save previous field - mi_page_reset(_mi_page_segment(page), page, 0, tld); - page->used = 0; - page->prev = page->next = NULL; + mi_page_purge_remove(page, tld); // remove from the list to maintain invariant for mi_page_purge + mi_page_purge(_mi_page_segment(page), page, tld); page = prev; } // discard the reset pages from the queue @@ -373,10 +390,14 @@ static void mi_reset_delayed(mi_segments_tld_t* tld) { Segment size calculations ----------------------------------------------------------- */ +static size_t mi_segment_raw_page_size(const mi_segment_t* segment) { + return (segment->page_kind == MI_PAGE_HUGE ? segment->segment_size : (size_t)1 << segment->page_shift); +} + // Raw start of the page available memory; can be used on uninitialized pages (only `segment_idx` must be set) // The raw start is not taking aligned block allocation into consideration. static uint8_t* mi_segment_raw_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size) { - size_t psize = (segment->page_kind == MI_PAGE_HUGE ? segment->segment_size : (size_t)1 << segment->page_shift); + size_t psize = mi_segment_raw_page_size(segment); uint8_t* p = (uint8_t*)segment + page->segment_idx * psize; if (page->segment_idx == 0) { @@ -394,35 +415,36 @@ static uint8_t* mi_segment_raw_page_start(const mi_segment_t* segment, const mi_ #endif if (page_size != NULL) *page_size = psize; - mi_assert_internal(page->xblock_size == 0 || _mi_ptr_page(p) == page); + mi_assert_internal(page->block_size == 0 || _mi_ptr_page(p) == page); mi_assert_internal(_mi_ptr_segment(p) == segment); return p; } // Start of the page available memory; can be used on uninitialized pages (only `segment_idx` must be set) -uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t block_size, size_t* page_size, size_t* pre_size) +uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size) { size_t psize; uint8_t* p = mi_segment_raw_page_start(segment, page, &psize); - if (pre_size != NULL) *pre_size = 0; - if (page->segment_idx == 0 && block_size > 0 && segment->page_kind <= MI_PAGE_MEDIUM) { + const size_t block_size = mi_page_block_size(page); + if (/*page->segment_idx == 0 &&*/ block_size > 0 && block_size <= MI_MAX_ALIGN_GUARANTEE) { // for small and medium objects, ensure the page start is aligned with the block size (PR#66 by kickunderscore) + mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM); size_t adjust = block_size - ((uintptr_t)p % block_size); - if (adjust < block_size) { + if (adjust < block_size && psize >= block_size + adjust) { p += adjust; psize -= adjust; - if (pre_size != NULL) *pre_size = adjust; + mi_assert_internal((uintptr_t)p % block_size == 0); } - mi_assert_internal((uintptr_t)p % block_size == 0); } if (page_size != NULL) *page_size = psize; - 
mi_assert_internal(page->xblock_size==0 || _mi_ptr_page(p) == page); + mi_assert_internal(_mi_ptr_page(p) == page); mi_assert_internal(_mi_ptr_segment(p) == segment); return p; } -static size_t mi_segment_size(size_t capacity, size_t required, size_t* pre_size, size_t* info_size) + +static size_t mi_segment_calculate_sizes(size_t capacity, size_t required, size_t* pre_size, size_t* info_size) { const size_t minsize = sizeof(mi_segment_t) + ((capacity - 1) * sizeof(mi_page_t)) + 16 /* padding */; size_t guardsize = 0; @@ -464,110 +486,104 @@ static void mi_segments_track_size(long segment_size, mi_segments_tld_t* tld) { static void mi_segment_os_free(mi_segment_t* segment, size_t segment_size, mi_segments_tld_t* tld) { segment->thread_id = 0; + _mi_segment_map_freed_at(segment); mi_segments_track_size(-((long)segment_size),tld); + if (segment->was_reclaimed) { + tld->reclaim_count--; + segment->was_reclaimed = false; + } + if (MI_SECURE != 0) { - mi_assert_internal(!segment->mem_is_pinned); + mi_assert_internal(!segment->memid.is_pinned); mi_segment_protect(segment, false, tld->os); // ensure no more guard pages are set } - bool any_reset = false; bool fully_committed = true; + size_t committed_size = 0; + const size_t page_size = mi_segment_raw_page_size(segment); for (size_t i = 0; i < segment->capacity; i++) { mi_page_t* page = &segment->pages[i]; + if (page->is_committed) { committed_size += page_size; } if (!page->is_committed) { fully_committed = false; } - if (page->is_reset) { any_reset = true; } - } - if (any_reset && mi_option_is_enabled(mi_option_reset_decommits)) { - fully_committed = false; } - _mi_mem_free(segment, segment_size, segment->memid, fully_committed, any_reset, tld->os); + MI_UNUSED(fully_committed); + mi_assert_internal((fully_committed && committed_size == segment_size) || (!fully_committed && committed_size < segment_size)); + + _mi_abandoned_await_readers(); // prevent ABA issue if concurrent readers try to access our memory (that might be purged) + _mi_arena_free(segment, segment_size, committed_size, segment->memid, tld->stats); } +// called from `heap_collect`. 
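// The delayed purge queue shown earlier reuses a page's `free` field to hold a
// 32-bit millisecond deadline and tests expiry with signed subtraction so that
// wrap-around is harmless. A self-contained sketch of that encoding, using
// simplified stand-in types rather than mimalloc's mi_page_t:
#include <stdint.h>
#include <stdbool.h>

typedef struct sketch_page_s {
  void* free_field;                     // stand-in for the reused `free` pointer field
} sketch_page_t;

static void sketch_set_expire(sketch_page_t* pg, int64_t now_ms, uint32_t delay_ms) {
  const uint32_t expire = (uint32_t)now_ms + delay_ms;   // deliberate truncation to 32 bits
  pg->free_field = (void*)(uintptr_t)expire;
}

static bool sketch_is_expired(const sketch_page_t* pg, int64_t now_ms) {
  const int32_t expire = (int32_t)(uintptr_t)pg->free_field;
  // 32-bit signed subtraction tolerates wrap-around as long as the configured
  // delay stays far below 2^31 milliseconds
  return (((int32_t)(uint32_t)now_ms - expire) >= 0);
}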
+void _mi_segments_collect(bool force, mi_segments_tld_t* tld) { + mi_pages_try_purge(force,tld); + #if MI_DEBUG>=2 + if (!_mi_is_main_thread()) { + mi_assert_internal(tld->pages_purge.first == NULL); + mi_assert_internal(tld->pages_purge.last == NULL); + } + #endif +} -// The thread local segment cache is limited to be at most 1/8 of the peak size of segments in use, -#define MI_SEGMENT_CACHE_FRACTION (8) -// note: returned segment may be partially reset -static mi_segment_t* mi_segment_cache_pop(size_t segment_size, mi_segments_tld_t* tld) { - if (segment_size != 0 && segment_size != MI_SEGMENT_SIZE) return NULL; - mi_segment_t* segment = tld->cache; - if (segment == NULL) return NULL; - tld->cache_count--; - tld->cache = segment->next; - segment->next = NULL; - mi_assert_internal(segment->segment_size == MI_SEGMENT_SIZE); - _mi_stat_decrease(&tld->stats->segments_cache, 1); - return segment; -} +/* ----------------------------------------------------------- + Segment allocation +----------------------------------------------------------- */ -static bool mi_segment_cache_full(mi_segments_tld_t* tld) +static mi_segment_t* mi_segment_os_alloc(bool eager_delayed, size_t page_alignment, mi_arena_id_t req_arena_id, + size_t pre_size, size_t info_size, bool commit, size_t segment_size, + mi_segments_tld_t* tld, mi_os_tld_t* tld_os) { - // if (tld->count == 1 && tld->cache_count==0) return false; // always cache at least the final segment of a thread - size_t max_cache = mi_option_get(mi_option_segment_cache); - if (tld->cache_count < max_cache - && tld->cache_count < (1 + (tld->peak_count / MI_SEGMENT_CACHE_FRACTION)) // at least allow a 1 element cache - ) { - return false; + mi_memid_t memid; + bool allow_large = (!eager_delayed && (MI_SECURE == 0)); // only allow large OS pages once we are no longer lazy + size_t align_offset = 0; + size_t alignment = MI_SEGMENT_SIZE; + if (page_alignment > 0) { + alignment = page_alignment; + align_offset = _mi_align_up(pre_size, MI_SEGMENT_SIZE); + segment_size = segment_size + (align_offset - pre_size); // adjust the segment size } - // take the opportunity to reduce the segment cache if it is too large (now) - // TODO: this never happens as we check against peak usage, should we use current usage instead? 
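// The (now removed) per-thread segment cache above capped itself both at the
// configured maximum and at one plus an eighth of the peak segment count
// (MI_SEGMENT_CACHE_FRACTION). A tiny sketch of that bound check:
#include <stdbool.h>
#include <stddef.h>

static bool sketch_cache_has_room(size_t cache_count, size_t max_cache, size_t peak_count) {
  return (cache_count < max_cache) && (cache_count < 1 + peak_count / 8);
}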
- while (tld->cache_count > max_cache) { //(1 + (tld->peak_count / MI_SEGMENT_CACHE_FRACTION))) { - mi_segment_t* segment = mi_segment_cache_pop(0,tld); - mi_assert_internal(segment != NULL); - if (segment != NULL) mi_segment_os_free(segment, segment->segment_size, tld); - } - return true; -} -static bool mi_segment_cache_push(mi_segment_t* segment, mi_segments_tld_t* tld) { - mi_assert_internal(!mi_segment_is_in_free_queue(segment, tld)); - mi_assert_internal(segment->next == NULL); - if (segment->segment_size != MI_SEGMENT_SIZE || mi_segment_cache_full(tld)) { - return false; + mi_segment_t* segment = (mi_segment_t*)_mi_arena_alloc_aligned(segment_size, alignment, align_offset, commit, allow_large, req_arena_id, &memid, tld_os); + if (segment == NULL) { + return NULL; // failed to allocate } - mi_assert_internal(segment->segment_size == MI_SEGMENT_SIZE); - segment->next = tld->cache; - tld->cache = segment; - tld->cache_count++; - _mi_stat_increase(&tld->stats->segments_cache,1); - return true; -} -// called by threads that are terminating to free cached segments -void _mi_segment_thread_collect(mi_segments_tld_t* tld) { - mi_segment_t* segment; - while ((segment = mi_segment_cache_pop(0,tld)) != NULL) { - mi_segment_os_free(segment, segment->segment_size, tld); - } - mi_assert_internal(tld->cache_count == 0); - mi_assert_internal(tld->cache == NULL); -#if MI_DEBUG>=2 - if (!_mi_is_main_thread()) { - mi_assert_internal(tld->pages_reset.first == NULL); - mi_assert_internal(tld->pages_reset.last == NULL); + if (!memid.initially_committed) { + // ensure the initial info is committed + mi_assert_internal(!memid.is_pinned); + bool ok = _mi_os_commit(segment, pre_size, NULL, tld_os->stats); + if (!ok) { + // commit failed; we cannot touch the memory: free the segment directly and return `NULL` + _mi_arena_free(segment, segment_size, 0, memid, tld_os->stats); + return NULL; + } } -#endif -} - -/* ----------------------------------------------------------- - Segment allocation ------------------------------------------------------------ */ + MI_UNUSED(info_size); + segment->memid = memid; + segment->allow_decommit = !memid.is_pinned; + segment->allow_purge = segment->allow_decommit && (mi_option_get(mi_option_purge_delay) >= 0); + segment->segment_size = segment_size; + mi_segments_track_size((long)(segment_size), tld); + _mi_segment_map_allocated_at(segment); + return segment; +} // Allocate a segment from the OS aligned to `MI_SEGMENT_SIZE` . 
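// mi_segment_os_alloc above over-allocates and aligns when a large
// page_alignment is requested. The rounding it relies on is plain align-up
// arithmetic; a sketch for power-of-two alignments (which these are):
#include <stdint.h>
#include <assert.h>

static inline uintptr_t sketch_align_up(uintptr_t p, uintptr_t alignment) {
  assert(alignment != 0 && (alignment & (alignment - 1)) == 0);  // power of two
  return (p + alignment - 1) & ~(alignment - 1);
}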
-static mi_segment_t* mi_segment_init(mi_segment_t* segment, size_t required, mi_page_kind_t page_kind, size_t page_shift, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) +static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind, size_t page_shift, size_t page_alignment, + mi_arena_id_t req_arena_id, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { - // the segment parameter is non-null if it came from our cache - mi_assert_internal(segment==NULL || (required==0 && page_kind <= MI_PAGE_LARGE)); + // required is only > 0 for huge page allocations + mi_assert_internal((required > 0 && page_kind > MI_PAGE_LARGE)|| (required==0 && page_kind <= MI_PAGE_LARGE)); // calculate needed sizes first size_t capacity; if (page_kind == MI_PAGE_HUGE) { - mi_assert_internal(page_shift == MI_SEGMENT_SHIFT && required > 0); + mi_assert_internal(page_shift == MI_SEGMENT_SHIFT + 1 && required > 0); capacity = 1; } else { - mi_assert_internal(required == 0); + mi_assert_internal(required == 0 && page_alignment == 0); size_t page_size = (size_t)1 << page_shift; capacity = MI_SEGMENT_SIZE / page_size; mi_assert_internal(MI_SEGMENT_SIZE % page_size == 0); @@ -575,108 +591,44 @@ static mi_segment_t* mi_segment_init(mi_segment_t* segment, size_t required, mi_ } size_t info_size; size_t pre_size; - size_t segment_size = mi_segment_size(capacity, required, &pre_size, &info_size); - mi_assert_internal(segment_size >= required); + const size_t init_segment_size = mi_segment_calculate_sizes(capacity, required, &pre_size, &info_size); + mi_assert_internal(init_segment_size >= required); // Initialize parameters - const bool eager_delayed = (page_kind <= MI_PAGE_MEDIUM && tld->count < (size_t)mi_option_get(mi_option_eager_commit_delay)); + const bool eager_delayed = (page_kind <= MI_PAGE_MEDIUM && // don't delay for large objects + // !_mi_os_has_overcommit() && // never delay on overcommit systems + _mi_current_thread_count() > 1 && // do not delay for the first N threads + tld->peak_count < (size_t)mi_option_get(mi_option_eager_commit_delay)); const bool eager = !eager_delayed && mi_option_is_enabled(mi_option_eager_commit); - bool commit = eager; // || (page_kind >= MI_PAGE_LARGE); - bool pages_still_good = false; - bool is_zero = false; + const bool init_commit = eager; // || (page_kind >= MI_PAGE_LARGE); - // Try to get it from our thread local cache first - if (segment != NULL) { - // came from cache - mi_assert_internal(segment->segment_size == segment_size); - if (page_kind <= MI_PAGE_MEDIUM && segment->page_kind == page_kind && segment->segment_size == segment_size) { - pages_still_good = true; - } - else - { - if (MI_SECURE!=0) { - mi_assert_internal(!segment->mem_is_pinned); - mi_segment_protect(segment, false, tld->os); // reset protection if the page kind differs - } - // different page kinds; unreset any reset pages, and unprotect - // TODO: optimize cache pop to return fitting pages if possible? - for (size_t i = 0; i < segment->capacity; i++) { - mi_page_t* page = &segment->pages[i]; - if (page->is_reset) { - if (!commit && mi_option_is_enabled(mi_option_reset_decommits)) { - page->is_reset = false; - } - else { - mi_page_unreset(segment, page, 0, tld); // todo: only unreset the part that was reset? 
(instead of the full page) - } - } - } - // ensure the initial info is committed - if (segment->capacity < capacity) { - bool commit_zero = false; - bool ok = _mi_mem_commit(segment, pre_size, &commit_zero, tld->os); - if (commit_zero) is_zero = true; - if (!ok) { - return NULL; - } - } - } - } - else { - // Allocate the segment from the OS - size_t memid; - bool mem_large = (!eager_delayed && (MI_SECURE==0)); // only allow large OS pages once we are no longer lazy - bool is_pinned = false; - segment = (mi_segment_t*)_mi_mem_alloc_aligned(segment_size, MI_SEGMENT_SIZE, &commit, &mem_large, &is_pinned, &is_zero, &memid, os_tld); - if (segment == NULL) return NULL; // failed to allocate - if (!commit) { - // ensure the initial info is committed - mi_assert_internal(!mem_large && !is_pinned); - bool commit_zero = false; - bool ok = _mi_mem_commit(segment, pre_size, &commit_zero, tld->os); - if (commit_zero) is_zero = true; - if (!ok) { - // commit failed; we cannot touch the memory: free the segment directly and return `NULL` - _mi_mem_free(segment, MI_SEGMENT_SIZE, memid, false, false, os_tld); - return NULL; - } - } - segment->memid = memid; - segment->mem_is_pinned = (mem_large || is_pinned); - segment->mem_is_committed = commit; - mi_segments_track_size((long)segment_size, tld); - } + // Allocate the segment from the OS (segment_size can change due to alignment) + mi_segment_t* segment = mi_segment_os_alloc(eager_delayed, page_alignment, req_arena_id, pre_size, info_size, init_commit, init_segment_size, tld, os_tld); + if (segment == NULL) return NULL; mi_assert_internal(segment != NULL && (uintptr_t)segment % MI_SEGMENT_SIZE == 0); - mi_assert_internal(segment->mem_is_pinned ? segment->mem_is_committed : true); - mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL); // tsan - if (!pages_still_good) { - // zero the segment info (but not the `mem` fields) - ptrdiff_t ofs = offsetof(mi_segment_t, next); - memset((uint8_t*)segment + ofs, 0, info_size - ofs); - - // initialize pages info - for (uint8_t i = 0; i < capacity; i++) { - segment->pages[i].segment_idx = i; - segment->pages[i].is_reset = false; - segment->pages[i].is_committed = commit; - segment->pages[i].is_zero_init = is_zero; - } - } - else { - // zero the segment info but not the pages info (and mem fields) - ptrdiff_t ofs = offsetof(mi_segment_t, next); - memset((uint8_t*)segment + ofs, 0, offsetof(mi_segment_t,pages) - ofs); + mi_assert_internal(segment->memid.is_pinned ? 
segment->memid.initially_committed : true); + + // zero the segment info (but not the `mem` fields) + ptrdiff_t ofs = offsetof(mi_segment_t, next); + _mi_memzero((uint8_t*)segment + ofs, info_size - ofs); + + // initialize pages info + const bool is_huge = (page_kind == MI_PAGE_HUGE); + for (size_t i = 0; i < capacity; i++) { + mi_assert_internal(i <= 255); + segment->pages[i].segment_idx = (uint8_t)i; + segment->pages[i].is_committed = segment->memid.initially_committed; + segment->pages[i].is_zero_init = segment->memid.initially_zero; + segment->pages[i].is_huge = is_huge; } // initialize segment->page_kind = page_kind; segment->capacity = capacity; segment->page_shift = page_shift; - segment->segment_size = segment_size; segment->segment_info_size = pre_size; segment->thread_id = _mi_thread_id(); segment->cookie = _mi_ptr_cookie(segment); - // _mi_stat_increase(&tld->stats->page_committed, segment->segment_info_size); // set protection mi_segment_protect(segment, true, tld->os); @@ -686,21 +638,16 @@ static mi_segment_t* mi_segment_init(mi_segment_t* segment, size_t required, mi_ mi_segment_insert_in_free_queue(segment, tld); } - //fprintf(stderr,"mimalloc: alloc segment at %p\n", (void*)segment); return segment; } -static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind, size_t page_shift, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { - return mi_segment_init(NULL, required, page_kind, page_shift, tld, os_tld); -} static void mi_segment_free(mi_segment_t* segment, bool force, mi_segments_tld_t* tld) { - UNUSED(force); + MI_UNUSED(force); mi_assert(segment != NULL); - // note: don't reset pages even on abandon as the whole segment is freed? (and ready for reuse) - bool force_reset = (force && mi_option_is_enabled(mi_option_abandoned_page_reset)); - mi_pages_reset_remove_all_in_segment(segment, force_reset, tld); - mi_segment_remove_from_free_queue(segment,tld); + // don't purge as we are freeing now + mi_segment_remove_all_purges(segment, false /* don't force as we are about to free */, tld); + mi_segment_remove_from_free_queue(segment, tld); mi_assert_expensive(!mi_segment_queue_contains(&tld->small_free, segment)); mi_assert_expensive(!mi_segment_queue_contains(&tld->medium_free, segment)); @@ -708,13 +655,8 @@ static void mi_segment_free(mi_segment_t* segment, bool force, mi_segments_tld_t mi_assert(segment->prev == NULL); _mi_stat_decrease(&tld->stats->page_committed, segment->segment_info_size); - if (!force && mi_segment_cache_push(segment, tld)) { - // it is put in our cache - } - else { - // otherwise return it to the OS - mi_segment_os_free(segment, segment->segment_size, tld); - } + // return it to the OS + mi_segment_os_free(segment, segment->segment_size, tld); } /* ----------------------------------------------------------- @@ -729,35 +671,15 @@ static bool mi_segment_has_free(const mi_segment_t* segment) { static bool mi_segment_page_claim(mi_segment_t* segment, mi_page_t* page, mi_segments_tld_t* tld) { mi_assert_internal(_mi_page_segment(page) == segment); mi_assert_internal(!page->segment_in_use); - mi_pages_reset_remove(page, tld); + mi_page_purge_remove(page, tld); + // check commit - if (!page->is_committed) { - mi_assert_internal(!segment->mem_is_pinned); - mi_assert_internal(!page->is_reset); - size_t psize; - uint8_t* start = mi_segment_raw_page_start(segment, page, &psize); - bool is_zero = false; - const size_t gsize = (MI_SECURE >= 2 ? 
_mi_os_page_size() : 0); - bool ok = _mi_mem_commit(start, psize + gsize, &is_zero, tld->os); - if (!ok) return false; // failed to commit! - if (gsize > 0) { mi_segment_protect_range(start + psize, gsize, true); } - if (is_zero) { page->is_zero_init = true; } - page->is_committed = true; - } + if (!mi_page_ensure_committed(segment, page, tld)) return false; + // set in-use before doing unreset to prevent delayed reset page->segment_in_use = true; segment->used++; - // check reset - if (page->is_reset) { - mi_assert_internal(!segment->mem_is_pinned); - bool ok = mi_page_unreset(segment, page, 0, tld); - if (!ok) { - page->segment_in_use = false; - segment->used--; - return false; - } - } - mi_assert_internal(page->segment_in_use); + mi_assert_internal(page->segment_in_use && page->is_committed && page->used==0 && !mi_pages_purge_contains(page,tld)); mi_assert_internal(segment->used <= segment->capacity); if (segment->used == segment->capacity && segment->page_kind <= MI_PAGE_MEDIUM) { // if no more free pages, remove from the queue @@ -775,7 +697,7 @@ static bool mi_segment_page_claim(mi_segment_t* segment, mi_page_t* page, mi_seg static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld); // clear page data; can be called on abandoned segments -static void mi_segment_page_clear(mi_segment_t* segment, mi_page_t* page, bool allow_reset, mi_segments_tld_t* tld) +static void mi_segment_page_clear(mi_segment_t* segment, mi_page_t* page, mi_segments_tld_t* tld) { mi_assert_internal(page->segment_in_use); mi_assert_internal(mi_page_all_free(page)); @@ -786,35 +708,30 @@ static void mi_segment_page_clear(mi_segment_t* segment, mi_page_t* page, bool a _mi_stat_decrease(&tld->stats->page_committed, inuse); _mi_stat_decrease(&tld->stats->pages, 1); - // calculate the used size from the raw (non-aligned) start of the page - //size_t pre_size; - //_mi_segment_page_start(segment, page, page->block_size, NULL, &pre_size); - //size_t used_size = pre_size + (page->capacity * page->block_size); - page->is_zero_init = false; page->segment_in_use = false; - // reset the page memory to reduce memory pressure? 
- // note: must come after setting `segment_in_use` to false but before block_size becomes 0 - //mi_page_reset(segment, page, 0 /*used_size*/, tld); - - // zero the page data, but not the segment fields and capacity, and block_size (for page size calculations) - uint32_t block_size = page->xblock_size; + // zero the page data, but not the segment fields and capacity, page start, and block_size (for page size calculations) + size_t block_size = page->block_size; + uint8_t block_size_shift = page->block_size_shift; + uint8_t heap_tag = page->heap_tag; + uint8_t* page_start = page->page_start; uint16_t capacity = page->capacity; uint16_t reserved = page->reserved; ptrdiff_t ofs = offsetof(mi_page_t,capacity); - memset((uint8_t*)page + ofs, 0, sizeof(*page) - ofs); + _mi_memzero((uint8_t*)page + ofs, sizeof(*page) - ofs); page->capacity = capacity; page->reserved = reserved; - page->xblock_size = block_size; + page->block_size = block_size; + page->block_size_shift = block_size_shift; + page->heap_tag = heap_tag; + page->page_start = page_start; segment->used--; - // add to the free page list for reuse/reset - if (allow_reset) { - mi_pages_reset_add(segment, page, tld); - } + // schedule purge + mi_segment_schedule_purge(segment, page, tld); - page->capacity = 0; // after reset these can be zero'd now + page->capacity = 0; // after purge these can be zero'd now page->reserved = 0; } @@ -823,10 +740,10 @@ void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld) mi_assert(page != NULL); mi_segment_t* segment = _mi_page_segment(page); mi_assert_expensive(mi_segment_is_valid(segment,tld)); - mi_reset_delayed(tld); + mi_pages_try_purge(false /*force?*/, tld); // mark it as free now - mi_segment_page_clear(segment, page, true, tld); + mi_segment_page_clear(segment, page, tld); if (segment->used == 0) { // no more used pages; remove from the free list and free the segment @@ -838,9 +755,11 @@ void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld) mi_segment_abandon(segment,tld); } else if (segment->used + 1 == segment->capacity) { - mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM); // for now we only support small and medium pages - // move back to segments free list - mi_segment_insert_in_free_queue(segment,tld); + mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM); // large and huge pages are always the single page in a segment + if (segment->page_kind <= MI_PAGE_MEDIUM) { + // move back to segments free list + mi_segment_insert_in_free_queue(segment,tld); + } } } } @@ -852,171 +771,21 @@ Abandonment When threads terminate, they can leave segments with live blocks (reached through other threads). Such segments are "abandoned" and will be reclaimed by other threads to -reuse their pages and/or free them eventually - -We maintain a global list of abandoned segments that are -reclaimed on demand. Since this is shared among threads -the implementation needs to avoid the A-B-A problem on -popping abandoned segments: -We use tagged pointers to avoid accidentially identifying -reused segments, much like stamped references in Java. -Secondly, we maintain a reader counter to avoid resetting -or decommitting segments that have a pending read operation. - -Note: the current implementation is one possible design; -another way might be to keep track of abandoned segments -in the regions. This would have the advantage of keeping -all concurrent code in one place and not needing to deal -with ABA issues. 
The drawback is that it is unclear how to -scan abandoned segments efficiently in that case as they -would be spread among all other segments in the regions. ------------------------------------------------------------ */ +reuse their pages and/or free them eventually. The +`thread_id` of such segments is 0. -// Use the bottom 20-bits (on 64-bit) of the aligned segment pointers -// to put in a tag that increments on update to avoid the A-B-A problem. -#define MI_TAGGED_MASK MI_SEGMENT_MASK -typedef uintptr_t mi_tagged_segment_t; +When a block is freed in an abandoned segment, the segment +is reclaimed into that thread. -static mi_segment_t* mi_tagged_segment_ptr(mi_tagged_segment_t ts) { - return (mi_segment_t*)(ts & ~MI_TAGGED_MASK); -} - -static mi_tagged_segment_t mi_tagged_segment(mi_segment_t* segment, mi_tagged_segment_t ts) { - mi_assert_internal(((uintptr_t)segment & MI_TAGGED_MASK) == 0); - uintptr_t tag = ((ts & MI_TAGGED_MASK) + 1) & MI_TAGGED_MASK; - return ((uintptr_t)segment | tag); -} - -// This is a list of visited abandoned pages that were full at the time. -// this list migrates to `abandoned` when that becomes NULL. The use of -// this list reduces contention and the rate at which segments are visited. -static mi_decl_cache_align _Atomic(mi_segment_t*) abandoned_visited; // = NULL - -// The abandoned page list (tagged as it supports pop) -static mi_decl_cache_align _Atomic(mi_tagged_segment_t) abandoned; // = NULL - -// Maintain these for debug purposes (these counts may be a bit off) -static mi_decl_cache_align _Atomic(uintptr_t) abandoned_count; -static mi_decl_cache_align _Atomic(uintptr_t) abandoned_visited_count; - -// We also maintain a count of current readers of the abandoned list -// in order to prevent resetting/decommitting segment memory if it might -// still be read. -static mi_decl_cache_align _Atomic(uintptr_t) abandoned_readers; // = 0 - -// Push on the visited list -static void mi_abandoned_visited_push(mi_segment_t* segment) { - mi_assert_internal(segment->thread_id == 0); - mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t,&segment->abandoned_next) == NULL); - mi_assert_internal(segment->next == NULL && segment->prev == NULL); - mi_assert_internal(segment->used > 0); - mi_segment_t* anext = mi_atomic_load_ptr_relaxed(mi_segment_t, &abandoned_visited); - do { - mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, anext); - } while (!mi_atomic_cas_ptr_weak_release(mi_segment_t, &abandoned_visited, &anext, segment)); - mi_atomic_increment_relaxed(&abandoned_visited_count); -} - -// Move the visited list to the abandoned list. 
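// The removed abandoned-list code above packs a counter into the low bits of
// the segment pointer so that a CAS which observes the "same" pointer twice
// can still detect an intervening pop/push (the classic A-B-A hazard). A
// compact sketch of that packing, using the 20 low bits mentioned in the
// removed comment for 64-bit builds:
#include <stdint.h>

#define SKETCH_TAG_MASK ((uintptr_t)((1u << 20) - 1))

static inline void* sketch_tagged_ptr(uintptr_t ts) {
  return (void*)(ts & ~SKETCH_TAG_MASK);            // strip the tag to recover the pointer
}

static inline uintptr_t sketch_make_tagged(void* p, uintptr_t prev_ts) {
  const uintptr_t tag = ((prev_ts & SKETCH_TAG_MASK) + 1) & SKETCH_TAG_MASK;
  return ((uintptr_t)p | tag);                      // pointer must be at least 1 MiB aligned
}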
-static bool mi_abandoned_visited_revisit(void) -{ - // quick check if the visited list is empty - if (mi_atomic_load_ptr_relaxed(mi_segment_t, &abandoned_visited) == NULL) return false; - - // grab the whole visited list - mi_segment_t* first = mi_atomic_exchange_ptr_acq_rel(mi_segment_t, &abandoned_visited, NULL); - if (first == NULL) return false; - - // first try to swap directly if the abandoned list happens to be NULL - mi_tagged_segment_t afirst; - mi_tagged_segment_t ts = mi_atomic_load_relaxed(&abandoned); - if (mi_tagged_segment_ptr(ts)==NULL) { - uintptr_t count = mi_atomic_load_relaxed(&abandoned_visited_count); - afirst = mi_tagged_segment(first, ts); - if (mi_atomic_cas_strong_acq_rel(&abandoned, &ts, afirst)) { - mi_atomic_add_relaxed(&abandoned_count, count); - mi_atomic_sub_relaxed(&abandoned_visited_count, count); - return true; - } - } - - // find the last element of the visited list: O(n) - mi_segment_t* last = first; - mi_segment_t* next; - while ((next = mi_atomic_load_ptr_relaxed(mi_segment_t, &last->abandoned_next)) != NULL) { - last = next; - } - - // and atomically prepend to the abandoned list - // (no need to increase the readers as we don't access the abandoned segments) - mi_tagged_segment_t anext = mi_atomic_load_relaxed(&abandoned); - uintptr_t count; - do { - count = mi_atomic_load_relaxed(&abandoned_visited_count); - mi_atomic_store_ptr_release(mi_segment_t, &last->abandoned_next, mi_tagged_segment_ptr(anext)); - afirst = mi_tagged_segment(first, anext); - } while (!mi_atomic_cas_weak_release(&abandoned, &anext, afirst)); - mi_atomic_add_relaxed(&abandoned_count, count); - mi_atomic_sub_relaxed(&abandoned_visited_count, count); - return true; -} - -// Push on the abandoned list. -static void mi_abandoned_push(mi_segment_t* segment) { - mi_assert_internal(segment->thread_id == 0); - mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL); - mi_assert_internal(segment->next == NULL && segment->prev == NULL); - mi_assert_internal(segment->used > 0); - mi_tagged_segment_t next; - mi_tagged_segment_t ts = mi_atomic_load_relaxed(&abandoned); - do { - mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, mi_tagged_segment_ptr(ts)); - next = mi_tagged_segment(segment, ts); - } while (!mi_atomic_cas_weak_release(&abandoned, &ts, next)); - mi_atomic_increment_relaxed(&abandoned_count); -} +Moreover, if threads are looking for a fresh segment, they +will first consider abondoned segments -- these can be found +by scanning the arena memory +(segments outside arena memoryare only reclaimed by a free). +----------------------------------------------------------- */ -// Wait until there are no more pending reads on segments that used to be in the abandoned list +// legacy: Wait until there are no more pending reads on segments that used to be in the abandoned list void _mi_abandoned_await_readers(void) { - uintptr_t n; - do { - n = mi_atomic_load_acquire(&abandoned_readers); - if (n != 0) mi_atomic_yield(); - } while (n != 0); -} - -// Pop from the abandoned list -static mi_segment_t* mi_abandoned_pop(void) { - mi_segment_t* segment; - // Check efficiently if it is empty (or if the visited list needs to be moved) - mi_tagged_segment_t ts = mi_atomic_load_relaxed(&abandoned); - segment = mi_tagged_segment_ptr(ts); - if (mi_likely(segment == NULL)) { - if (mi_likely(!mi_abandoned_visited_revisit())) { // try to swap in the visited list on NULL - return NULL; - } - } - - // Do a pop. 
We use a reader count to prevent - // a segment to be decommitted while a read is still pending, - // and a tagged pointer to prevent A-B-A link corruption. - // (this is called from `region.c:_mi_mem_free` for example) - mi_atomic_increment_relaxed(&abandoned_readers); // ensure no segment gets decommitted - mi_tagged_segment_t next = 0; - ts = mi_atomic_load_acquire(&abandoned); - do { - segment = mi_tagged_segment_ptr(ts); - if (segment != NULL) { - mi_segment_t* anext = mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next); - next = mi_tagged_segment(anext, ts); // note: reads the segment's `abandoned_next` field so should not be decommitted - } - } while (segment != NULL && !mi_atomic_cas_weak_acq_rel(&abandoned, &ts, next)); - mi_atomic_decrement_relaxed(&abandoned_readers); // release reader lock - if (segment != NULL) { - mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL); - mi_atomic_decrement_relaxed(&abandoned_count); - } - return segment; + // nothing needed } /* ----------------------------------------------------------- @@ -1026,22 +795,27 @@ static mi_segment_t* mi_abandoned_pop(void) { static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) { mi_assert_internal(segment->used == segment->abandoned); mi_assert_internal(segment->used > 0); - mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL); mi_assert_expensive(mi_segment_is_valid(segment, tld)); + // Potentially force purge. Only abandoned segments in arena memory can be + // reclaimed without a free so if a segment is not from an arena we force purge here to be conservative. + mi_pages_try_purge(false /*force?*/,tld); + const bool force_purge = (segment->memid.memkind != MI_MEM_ARENA) || mi_option_is_enabled(mi_option_abandoned_page_purge); + mi_segment_remove_all_purges(segment, force_purge, tld); + // remove the segment from the free page queue if needed - mi_reset_delayed(tld); - mi_pages_reset_remove_all_in_segment(segment, mi_option_is_enabled(mi_option_abandoned_page_reset), tld); mi_segment_remove_from_free_queue(segment, tld); mi_assert_internal(segment->next == NULL && segment->prev == NULL); // all pages in the segment are abandoned; add it to the abandoned list _mi_stat_increase(&tld->stats->segments_abandoned, 1); mi_segments_track_size(-((long)segment->segment_size), tld); - segment->thread_id = 0; segment->abandoned_visits = 0; - mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL); - mi_abandoned_push(segment); + if (segment->was_reclaimed) { + tld->reclaim_count--; + segment->was_reclaimed = false; + } + _mi_arena_segment_mark_abandoned(segment); } void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld) { @@ -1049,7 +823,7 @@ void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld) { mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE); mi_assert_internal(mi_page_heap(page) == NULL); mi_segment_t* segment = _mi_page_segment(page); - mi_assert_expensive(!mi_pages_reset_contains(page, tld)); + mi_assert_expensive(!mi_pages_purge_contains(page, tld)); mi_assert_expensive(mi_segment_is_valid(segment, tld)); segment->abandoned++; _mi_stat_increase(&tld->stats->pages_abandoned, 1); @@ -1067,7 +841,6 @@ void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld) { // Possibly clear pages and check if free space is available static bool mi_segment_check_free(mi_segment_t* segment, size_t block_size, bool* all_pages_free) { - 
mi_assert_internal(block_size < MI_HUGE_BLOCK_SIZE); bool has_page = false; size_t pages_used = 0; size_t pages_used_empty = 0; @@ -1083,7 +856,7 @@ static bool mi_segment_check_free(mi_segment_t* segment, size_t block_size, bool pages_used_empty++; has_page = true; } - else if (page->xblock_size == block_size && mi_page_has_any_available(page)) { + else if (mi_page_block_size(page) == block_size && mi_page_has_any_available(page)) { // a page has available free blocks of the right size has_page = true; } @@ -1104,11 +877,13 @@ static bool mi_segment_check_free(mi_segment_t* segment, size_t block_size, bool // Reclaim a segment; returns NULL if the segment was freed // set `right_page_reclaimed` to `true` if it reclaimed a page of the right `block_size` that was not full. static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, size_t requested_block_size, bool* right_page_reclaimed, mi_segments_tld_t* tld) { - mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL); if (right_page_reclaimed != NULL) { *right_page_reclaimed = false; } - - segment->thread_id = _mi_thread_id(); + // can be 0 still with abandoned_next, or already a thread id for segments outside an arena that are reclaimed on a free. + mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0 || mi_atomic_load_relaxed(&segment->thread_id) == _mi_thread_id()); + mi_atomic_store_release(&segment->thread_id, _mi_thread_id()); segment->abandoned_visits = 0; + segment->was_reclaimed = true; + tld->reclaim_count++; mi_segments_track_size((long)segment->segment_size, tld); mi_assert_internal(segment->next == NULL && segment->prev == NULL); mi_assert_expensive(mi_segment_is_valid(segment, tld)); @@ -1117,7 +892,6 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, for (size_t i = 0; i < segment->capacity; i++) { mi_page_t* page = &segment->pages[i]; if (page->segment_in_use) { - mi_assert_internal(!page->is_reset); mi_assert_internal(page->is_committed); mi_assert_internal(mi_page_not_in_queue(page, tld)); mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE); @@ -1126,26 +900,32 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, mi_assert(page->next == NULL); _mi_stat_decrease(&tld->stats->pages_abandoned, 1); // set the heap again and allow heap thread delayed free again. - mi_page_set_heap(page, heap); + mi_heap_t* target_heap = _mi_heap_by_tag(heap, page->heap_tag); // allow custom heaps to separate objects + if (target_heap == NULL) { + target_heap = heap; + _mi_error_message(EINVAL, "page with tag %u cannot be reclaimed by a heap with the same tag (using %u instead)\n", page->heap_tag, heap->tag ); + } + mi_page_set_heap(page, target_heap); _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, true); // override never (after heap is set) - // TODO: should we not collect again given that we just collected in `check_free`? 
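// Reclaiming routes each abandoned page to a heap registered for the page's
// heap_tag, falling back to the reclaiming heap when no such heap exists, as
// the code just above does. Sketch with a hypothetical registry lookup
// (sketch_heap_by_tag is a stand-in, not mimalloc's _mi_heap_by_tag):
#include <stddef.h>
#include <stdint.h>

typedef struct sketch_heap_s { uint8_t tag; struct sketch_heap_s* next; } sketch_heap_t;

static sketch_heap_t* sketch_heap_by_tag(sketch_heap_t* heaps, uint8_t tag) {
  for (sketch_heap_t* h = heaps; h != NULL; h = h->next) {
    if (h->tag == tag) return h;
  }
  return NULL;  // no heap registered for this tag
}

static sketch_heap_t* sketch_reclaim_target(sketch_heap_t* heaps, sketch_heap_t* fallback, uint8_t page_tag) {
  sketch_heap_t* target = sketch_heap_by_tag(heaps, page_tag);
  return (target != NULL ? target : fallback);
}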
_mi_page_free_collect(page, false); // ensure used count is up to date if (mi_page_all_free(page)) { // if everything free already, clear the page directly - mi_segment_page_clear(segment, page, true, tld); // reset is ok now + mi_segment_page_clear(segment, page, tld); // reset is ok now } else { // otherwise reclaim it into the heap - _mi_page_reclaim(heap, page); - if (requested_block_size == page->xblock_size && mi_page_has_any_available(page)) { + _mi_page_reclaim(target_heap, page); + if (requested_block_size == mi_page_block_size(page) && mi_page_has_any_available(page) && heap == target_heap) { if (right_page_reclaimed != NULL) { *right_page_reclaimed = true; } } } } - else if (page->is_committed && !page->is_reset) { // not in-use, and not reset yet + /* expired + else if (page->is_committed) { // not in-use, and not reset yet // note: do not reset as this includes pages that were not touched before - // mi_pages_reset_add(segment, page, tld); + // mi_pages_purge_add(segment, page, tld); } + */ } mi_assert_internal(segment->abandoned == 0); if (segment->used == 0) { @@ -1161,21 +941,55 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, } } +// attempt to reclaim a particular segment (called from multi threaded free `alloc.c:mi_free_block_mt`) +bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment) { + if (mi_atomic_load_relaxed(&segment->thread_id) != 0) return false; // it is not abandoned + // don't reclaim more from a free than half the current segments + // this is to prevent a pure free-ing thread to start owning too many segments + if (heap->tld->segments.reclaim_count * 2 > heap->tld->segments.count) return false; + if (_mi_arena_segment_clear_abandoned(segment)) { // atomically unabandon + mi_segment_t* res = mi_segment_reclaim(segment, heap, 0, NULL, &heap->tld->segments); + mi_assert_internal(res == segment); + return (res != NULL); + } + return false; +} void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld) { mi_segment_t* segment; - while ((segment = mi_abandoned_pop()) != NULL) { + mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap, ¤t); + while ((segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL) { mi_segment_reclaim(segment, heap, 0, NULL, tld); } } +static long mi_segment_get_reclaim_tries(void) { + // limit the tries to 10% (default) of the abandoned segments with at least 8 and at most 1024 tries. + const size_t perc = (size_t)mi_option_get_clamp(mi_option_max_segment_reclaim, 0, 100); + if (perc <= 0) return 0; + const size_t total_count = _mi_arena_segment_abandoned_count(); + if (total_count == 0) return 0; + const size_t relative_count = (total_count > 10000 ? (total_count / 100) * perc : (total_count * perc) / 100); // avoid overflow + long max_tries = (long)(relative_count <= 1 ? 1 : (relative_count > 1024 ? 
1024 : relative_count)); + if (max_tries < 8 && total_count > 8) { max_tries = 8; } + return max_tries; +} + static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t block_size, mi_page_kind_t page_kind, bool* reclaimed, mi_segments_tld_t* tld) { *reclaimed = false; + long max_tries = mi_segment_get_reclaim_tries(); + if (max_tries <= 0) return NULL; + mi_segment_t* segment; - int max_tries = 8; // limit the work to bound allocation times - while ((max_tries-- > 0) && ((segment = mi_abandoned_pop()) != NULL)) { + mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap, ¤t); + while ((max_tries-- > 0) && ((segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL)) + { segment->abandoned_visits++; + // todo: should we respect numa affinity for abondoned reclaim? perhaps only for the first visit? + // todo: an arena exclusive heap will potentially visit many abandoned unsuitable segments and use many tries + // Perhaps we can skip non-suitable ones in a better way? + bool is_suitable = _mi_heap_memid_is_suitable(heap, segment->memid); bool all_pages_free; bool has_page = mi_segment_check_free(segment,block_size,&all_pages_free); // try to free up pages (due to concurrent frees) if (all_pages_free) { @@ -1186,19 +1000,20 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t block_size, // freeing but that would violate some invariants temporarily) mi_segment_reclaim(segment, heap, 0, NULL, tld); } - else if (has_page && segment->page_kind == page_kind) { + else if (has_page && segment->page_kind == page_kind && is_suitable) { // found a free page of the right kind, or page of the right block_size with free space // we return the result of reclaim (which is usually `segment`) as it might free // the segment due to concurrent frees (in which case `NULL` is returned). return mi_segment_reclaim(segment, heap, block_size, reclaimed, tld); } - else if (segment->abandoned_visits >= 3) { + else if (segment->abandoned_visits >= 3 && is_suitable) { // always reclaim on 3rd visit to limit the list length. mi_segment_reclaim(segment, heap, 0, NULL, tld); } else { - // otherwise, push on the visited list so it gets not looked at too quickly again - mi_abandoned_visited_push(segment); + // otherwise, mark it back as abandoned + // todo: reset delayed pages in the segment? + _mi_arena_segment_mark_abandoned(segment); } } return NULL; @@ -1212,16 +1027,12 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t block_size, static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t block_size, mi_page_kind_t page_kind, size_t page_shift, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { mi_assert_internal(page_kind <= MI_PAGE_LARGE); - mi_assert_internal(block_size < MI_HUGE_BLOCK_SIZE); - // 1. try to get a segment from our cache - mi_segment_t* segment = mi_segment_cache_pop(MI_SEGMENT_SIZE, tld); - if (segment != NULL) { - mi_segment_init(segment, 0, page_kind, page_shift, tld, os_tld); - return segment; - } - // 2. try to reclaim an abandoned segment + mi_assert_internal(block_size <= MI_LARGE_OBJ_SIZE_MAX); + + // 1. 
try to reclaim an abandoned segment bool reclaimed; - segment = mi_segment_try_reclaim(heap, block_size, page_kind, &reclaimed, tld); + mi_segment_t* segment = mi_segment_try_reclaim(heap, block_size, page_kind, &reclaimed, tld); + mi_assert_internal(segment == NULL || _mi_arena_memid_is_suitable(segment->memid, heap->arena_id)); if (reclaimed) { // reclaimed the right page right into the heap mi_assert_internal(segment != NULL && segment->page_kind == page_kind && page_kind <= MI_PAGE_LARGE); @@ -1231,8 +1042,8 @@ static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t block_s // reclaimed a segment with empty pages (of `page_kind`) in it return segment; } - // 3. otherwise allocate a fresh segment - return mi_segment_alloc(0, page_kind, page_shift, tld, os_tld); + // 2. otherwise allocate a fresh segment + return mi_segment_alloc(0, page_kind, page_shift, 0, heap->arena_id, tld, os_tld); } @@ -1260,24 +1071,33 @@ static mi_page_t* mi_segment_page_alloc_in(mi_segment_t* segment, mi_segments_tl return mi_segment_find_free(segment, tld); } -static mi_page_t* mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, mi_page_kind_t kind, size_t page_shift, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { +static mi_page_t* mi_segment_page_try_alloc_in_queue(mi_heap_t* heap, mi_page_kind_t kind, mi_segments_tld_t* tld) { // find an available segment the segment free queue mi_segment_queue_t* const free_queue = mi_segment_free_queue_of_kind(kind, tld); - if (mi_segment_queue_is_empty(free_queue)) { + for (mi_segment_t* segment = free_queue->first; segment != NULL; segment = segment->next) { + if (_mi_arena_memid_is_suitable(segment->memid, heap->arena_id) && mi_segment_has_free(segment)) { + return mi_segment_page_alloc_in(segment, tld); + } + } + return NULL; +} + +static mi_page_t* mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, mi_page_kind_t kind, size_t page_shift, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { + mi_page_t* page = mi_segment_page_try_alloc_in_queue(heap, kind, tld); + if (page == NULL) { // possibly allocate or reclaim a fresh segment mi_segment_t* const segment = mi_segment_reclaim_or_alloc(heap, block_size, kind, page_shift, tld, os_tld); if (segment == NULL) return NULL; // return NULL if out-of-memory (or reclaimed) - mi_assert_internal(free_queue->first == segment); mi_assert_internal(segment->page_kind==kind); mi_assert_internal(segment->used < segment->capacity); + mi_assert_internal(_mi_arena_memid_is_suitable(segment->memid, heap->arena_id)); + page = mi_segment_page_try_alloc_in_queue(heap, kind, tld); // this should now succeed } - mi_assert_internal(free_queue->first != NULL); - mi_page_t* const page = mi_segment_page_alloc_in(free_queue->first, tld); mi_assert_internal(page != NULL); -#if MI_DEBUG>=2 + #if MI_DEBUG>=2 && !MI_TRACK_ENABLED // && !MI_TSAN // verify it is committed - _mi_segment_page_start(_mi_page_segment(page), page, sizeof(void*), NULL, NULL)[0] = 0; -#endif + mi_segment_raw_page_start(_mi_page_segment(page), page, NULL)[0] = 0; + #endif return page; } @@ -1298,24 +1118,45 @@ static mi_page_t* mi_segment_large_page_alloc(mi_heap_t* heap, size_t block_size if (segment == NULL) return NULL; mi_page_t* page = mi_segment_find_free(segment, tld); mi_assert_internal(page != NULL); -#if MI_DEBUG>=2 - _mi_segment_page_start(segment, page, sizeof(void*), NULL, NULL)[0] = 0; +#if MI_DEBUG>=2 && !MI_TRACK_ENABLED // && !MI_TSAN + mi_segment_raw_page_start(segment, page, NULL)[0] = 0; #endif return page; } -static mi_page_t* 
mi_segment_huge_page_alloc(size_t size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) +static mi_page_t* mi_segment_huge_page_alloc(size_t size, size_t page_alignment, mi_arena_id_t req_arena_id, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { - mi_segment_t* segment = mi_segment_alloc(size, MI_PAGE_HUGE, MI_SEGMENT_SHIFT,tld,os_tld); + mi_segment_t* segment = mi_segment_alloc(size, MI_PAGE_HUGE, MI_SEGMENT_SHIFT + 1, page_alignment, req_arena_id, tld, os_tld); if (segment == NULL) return NULL; mi_assert_internal(mi_segment_page_size(segment) - segment->segment_info_size - (2*(MI_SECURE == 0 ? 0 : _mi_os_page_size())) >= size); + #if MI_HUGE_PAGE_ABANDON segment->thread_id = 0; // huge pages are immediately abandoned mi_segments_track_size(-(long)segment->segment_size, tld); + #endif mi_page_t* page = mi_segment_find_free(segment, tld); mi_assert_internal(page != NULL); + mi_assert_internal(page->is_huge); + + // for huge pages we initialize the block_size as we may + // overallocate to accommodate large alignments. + size_t psize; + uint8_t* start = mi_segment_raw_page_start(segment, page, &psize); + page->block_size = psize; + + // reset the part of the page that will not be used; this can be quite large (close to MI_SEGMENT_SIZE) + if (page_alignment > 0 && segment->allow_decommit && page->is_committed) { + uint8_t* aligned_p = (uint8_t*)_mi_align_up((uintptr_t)start, page_alignment); + mi_assert_internal(_mi_is_aligned(aligned_p, page_alignment)); + mi_assert_internal(psize - (aligned_p - start) >= size); + uint8_t* decommit_start = start + sizeof(mi_block_t); // for the free list + ptrdiff_t decommit_size = aligned_p - decommit_start; + _mi_os_reset(decommit_start, decommit_size, os_tld->stats); // do not decommit as it may be in a region + } + return page; } +#if MI_HUGE_PAGE_ABANDON // free huge block from another thread void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) { // huge page segments are always abandoned and can be freed immediately by any thread @@ -1326,12 +1167,12 @@ void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block // claim it and free mi_heap_t* heap = mi_heap_get_default(); // issue #221; don't use the internal get_default_heap as we need to ensure the thread is initialized. 
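// Freeing a block in an abandoned huge segment first claims the segment by
// compare-and-swapping the current thread id into the zeroed thread_id field,
// as the surrounding code does. A self-contained C11 sketch of that claim
// step (memory ordering simplified to the default seq_cst):
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

static bool sketch_claim_segment(_Atomic size_t* owner_tid, size_t my_tid) {
  size_t expected = 0;               // 0 means abandoned / unowned
  return atomic_compare_exchange_strong(owner_tid, &expected, my_tid);
}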
// paranoia: if this it the last reference, the cas should always succeed - uintptr_t expected_tid = 0; + size_t expected_tid = 0; if (mi_atomic_cas_strong_acq_rel(&segment->thread_id, &expected_tid, heap->thread_id)) { mi_block_set_next(page, block, page->free); page->free = block; page->used--; - page->is_zero = false; + page->is_zero_init = false; mi_assert(page->used == 0); mi_tld_t* tld = heap->tld; mi_segments_track_size((long)segment->segment_size, &tld->segments); @@ -1344,27 +1185,52 @@ void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block #endif } +#else +// reset memory of a huge block from another thread +void _mi_segment_huge_page_reset(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) { + mi_assert_internal(segment->page_kind == MI_PAGE_HUGE); + mi_assert_internal(segment == _mi_page_segment(page)); + mi_assert_internal(page->used == 1); // this is called just before the free + mi_assert_internal(page->free == NULL); + if (segment->allow_decommit && page->is_committed) { + size_t usize = mi_usable_size(block); + if (usize > sizeof(mi_block_t)) { + usize = usize - sizeof(mi_block_t); + uint8_t* p = (uint8_t*)block + sizeof(mi_block_t); + _mi_os_reset(p, usize, &_mi_stats_main); + } + } +} +#endif + /* ----------------------------------------------------------- Page allocation ----------------------------------------------------------- */ -mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { +mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { mi_page_t* page; - if (block_size <= MI_SMALL_OBJ_SIZE_MAX) { + if mi_unlikely(page_alignment > MI_BLOCK_ALIGNMENT_MAX) { + mi_assert_internal(_mi_is_power_of_two(page_alignment)); + mi_assert_internal(page_alignment >= MI_SEGMENT_SIZE); + //mi_assert_internal((MI_SEGMENT_SIZE % page_alignment) == 0); + if (page_alignment < MI_SEGMENT_SIZE) { page_alignment = MI_SEGMENT_SIZE; } + page = mi_segment_huge_page_alloc(block_size, page_alignment, heap->arena_id, tld, os_tld); + } + else if (block_size <= MI_SMALL_OBJ_SIZE_MAX) { page = mi_segment_small_page_alloc(heap, block_size, tld, os_tld); } else if (block_size <= MI_MEDIUM_OBJ_SIZE_MAX) { page = mi_segment_medium_page_alloc(heap, block_size, tld, os_tld); } - else if (block_size <= MI_LARGE_OBJ_SIZE_MAX) { + else if (block_size <= MI_LARGE_OBJ_SIZE_MAX /* || mi_is_good_fit(block_size, MI_LARGE_PAGE_SIZE - sizeof(mi_segment_t)) */ ) { page = mi_segment_large_page_alloc(heap, block_size, tld, os_tld); } else { - page = mi_segment_huge_page_alloc(block_size,tld,os_tld); + page = mi_segment_huge_page_alloc(block_size, page_alignment, heap->arena_id, tld, os_tld); } mi_assert_expensive(page == NULL || mi_segment_is_valid(_mi_page_segment(page),tld)); mi_assert_internal(page == NULL || (mi_segment_page_size(_mi_page_segment(page)) - (MI_SECURE == 0 ? 
0 : _mi_os_page_size())) >= block_size); - mi_reset_delayed(tld); + // mi_segment_try_purge(tld); mi_assert_internal(page == NULL || mi_page_not_in_queue(page, tld)); return page; } diff --git a/contrib/libs/mimalloc/src/static.c b/contrib/libs/mimalloc/src/static.c deleted file mode 100644 index 4b3abc285a0a..000000000000 --- a/contrib/libs/mimalloc/src/static.c +++ /dev/null @@ -1,39 +0,0 @@ -/* ---------------------------------------------------------------------------- -Copyright (c) 2018-2020, Microsoft Research, Daan Leijen -This is free software; you can redistribute it and/or modify it under the -terms of the MIT license. A copy of the license can be found in the file -"LICENSE" at the root of this distribution. ------------------------------------------------------------------------------*/ -#ifndef _DEFAULT_SOURCE -#define _DEFAULT_SOURCE -#endif -#if defined(__sun) -// same remarks as os.c for the static's context. -#undef _XOPEN_SOURCE -#undef _POSIX_C_SOURCE -#endif - -#include "mimalloc.h" -#include "mimalloc-internal.h" - -// For a static override we create a single object file -// containing the whole library. If it is linked first -// it will override all the standard library allocation -// functions (on Unix's). -#include "stats.c" -#include "random.c" -#include "os.c" -#include "bitmap.c" -#include "arena.c" -#include "region.c" -#include "segment.c" -#include "page.c" -#include "heap.c" -#include "alloc.c" -#include "alloc-aligned.c" -#include "alloc-posix.c" -#if MI_OSX_ZONE -#include "alloc-override-osx.c" -#endif -#include "init.c" -#include "options.c" diff --git a/contrib/libs/mimalloc/src/stats.c b/contrib/libs/mimalloc/src/stats.c index 7358539aa504..99cf89c5b727 100644 --- a/contrib/libs/mimalloc/src/stats.c +++ b/contrib/libs/mimalloc/src/stats.c @@ -5,10 +5,10 @@ terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #include "mimalloc.h" -#include "mimalloc-internal.h" -#include "mimalloc-atomic.h" +#include "mimalloc/internal.h" +#include "mimalloc/atomic.h" +#include "mimalloc/prim.h" -#include // fputs, stderr #include // memset #if defined(_MSC_VER) && (_MSC_VER < 1920) @@ -21,7 +21,7 @@ terms of the MIT license. A copy of the license can be found in the file static bool mi_is_in_main(void* stat) { return ((uint8_t*)stat >= (uint8_t*)&_mi_stats_main - && (uint8_t*)stat < ((uint8_t*)&_mi_stats_main + sizeof(mi_stats_t))); + && (uint8_t*)stat < ((uint8_t*)&_mi_stats_main + sizeof(mi_stats_t))); } static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) { @@ -51,7 +51,7 @@ static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) { } } -void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) { +void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) { if (mi_is_in_main(stat)) { mi_atomic_addi64_relaxed( &stat->count, 1 ); mi_atomic_addi64_relaxed( &stat->total, (int64_t)amount ); @@ -77,7 +77,7 @@ static void mi_stat_add(mi_stat_count_t* stat, const mi_stat_count_t* src, int64 mi_atomic_addi64_relaxed( &stat->allocated, src->allocated * unit); mi_atomic_addi64_relaxed( &stat->current, src->current * unit); mi_atomic_addi64_relaxed( &stat->freed, src->freed * unit); - // peak scores do not work across threads.. + // peak scores do not work across threads.. 
mi_atomic_addi64_relaxed( &stat->peak, src->peak * unit); } @@ -95,6 +95,7 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) { mi_stat_add(&stats->reserved, &src->reserved, 1); mi_stat_add(&stats->committed, &src->committed, 1); mi_stat_add(&stats->reset, &src->reset, 1); + mi_stat_add(&stats->purged, &src->purged, 1); mi_stat_add(&stats->page_committed, &src->page_committed, 1); mi_stat_add(&stats->pages_abandoned, &src->pages_abandoned, 1); @@ -110,12 +111,13 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) { mi_stat_counter_add(&stats->pages_extended, &src->pages_extended, 1); mi_stat_counter_add(&stats->mmap_calls, &src->mmap_calls, 1); mi_stat_counter_add(&stats->commit_calls, &src->commit_calls, 1); + mi_stat_counter_add(&stats->reset_calls, &src->reset_calls, 1); + mi_stat_counter_add(&stats->purge_calls, &src->purge_calls, 1); mi_stat_counter_add(&stats->page_no_retire, &src->page_no_retire, 1); mi_stat_counter_add(&stats->searches, &src->searches, 1); mi_stat_counter_add(&stats->normal_count, &src->normal_count, 1); - mi_stat_counter_add(&stats->huge_count, &src->huge_count, 1); - mi_stat_counter_add(&stats->giant_count, &src->giant_count, 1); + mi_stat_counter_add(&stats->huge_count, &src->huge_count, 1); #if MI_STAT>1 for (size_t i = 0; i <= MI_BIN_HUGE; i++) { if (src->normal_bins[i].allocated > 0 || src->normal_bins[i].freed > 0) { @@ -129,31 +131,35 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) { Display statistics ----------------------------------------------------------- */ -// unit > 0 : size in binary bytes +// unit > 0 : size in binary bytes // unit == 0: count as decimal // unit < 0 : count in binary static void mi_printf_amount(int64_t n, int64_t unit, mi_output_fun* out, void* arg, const char* fmt) { - char buf[32]; + char buf[32]; buf[0] = 0; int len = 32; - const char* suffix = (unit <= 0 ? " " : "b"); + const char* suffix = (unit <= 0 ? " " : "B"); const int64_t base = (unit == 0 ? 1000 : 1024); if (unit>0) n *= unit; const int64_t pos = (n < 0 ? -n : n); if (pos < base) { - snprintf(buf, len, "%d %s ", (int)n, suffix); + if (n!=1 || suffix[0] != 'B') { // skip printing 1 B for the unit column + _mi_snprintf(buf, len, "%lld %-3s", (long long)n, (n==0 ? "" : suffix)); + } } else { int64_t divider = base; - const char* magnitude = "k"; - if (pos >= divider*base) { divider *= base; magnitude = "m"; } - if (pos >= divider*base) { divider *= base; magnitude = "g"; } + const char* magnitude = "K"; + if (pos >= divider*base) { divider *= base; magnitude = "M"; } + if (pos >= divider*base) { divider *= base; magnitude = "G"; } const int64_t tens = (n / (divider/10)); const long whole = (long)(tens/10); const long frac1 = (long)(tens%10); - snprintf(buf, len, "%ld.%ld %s%s", whole, (frac1 < 0 ? -frac1 : frac1), magnitude, suffix); + char unitdesc[8]; + _mi_snprintf(unitdesc, 8, "%s%s%s", magnitude, (base==1024 ? "i" : ""), suffix); + _mi_snprintf(buf, len, "%ld.%ld %-3s", whole, (frac1 < 0 ? -frac1 : frac1), unitdesc); } - _mi_fprintf(out, arg, (fmt==NULL ? "%11s" : fmt), buf); + _mi_fprintf(out, arg, (fmt==NULL ? 
"%12s" : fmt), buf); } @@ -162,58 +168,71 @@ static void mi_print_amount(int64_t n, int64_t unit, mi_output_fun* out, void* a } static void mi_print_count(int64_t n, int64_t unit, mi_output_fun* out, void* arg) { - if (unit==1) _mi_fprintf(out, arg, "%11s"," "); + if (unit==1) _mi_fprintf(out, arg, "%12s"," "); else mi_print_amount(n,0,out,arg); } -static void mi_stat_print(const mi_stat_count_t* stat, const char* msg, int64_t unit, mi_output_fun* out, void* arg ) { +static void mi_stat_print_ex(const mi_stat_count_t* stat, const char* msg, int64_t unit, mi_output_fun* out, void* arg, const char* notok ) { _mi_fprintf(out, arg,"%10s:", msg); - if (unit>0) { - mi_print_amount(stat->peak, unit, out, arg); - mi_print_amount(stat->allocated, unit, out, arg); - mi_print_amount(stat->freed, unit, out, arg); - mi_print_amount(stat->current, unit, out, arg); - mi_print_amount(unit, 1, out, arg); - mi_print_count(stat->allocated, unit, out, arg); - if (stat->allocated > stat->freed) - _mi_fprintf(out, arg, " not all freed!\n"); - else - _mi_fprintf(out, arg, " ok\n"); - } - else if (unit<0) { - mi_print_amount(stat->peak, -1, out, arg); - mi_print_amount(stat->allocated, -1, out, arg); - mi_print_amount(stat->freed, -1, out, arg); - mi_print_amount(stat->current, -1, out, arg); - if (unit==-1) { - _mi_fprintf(out, arg, "%22s", ""); + if (unit != 0) { + if (unit > 0) { + mi_print_amount(stat->peak, unit, out, arg); + mi_print_amount(stat->allocated, unit, out, arg); + mi_print_amount(stat->freed, unit, out, arg); + mi_print_amount(stat->current, unit, out, arg); + mi_print_amount(unit, 1, out, arg); + mi_print_count(stat->allocated, unit, out, arg); } else { - mi_print_amount(-unit, 1, out, arg); - mi_print_count((stat->allocated / -unit), 0, out, arg); + mi_print_amount(stat->peak, -1, out, arg); + mi_print_amount(stat->allocated, -1, out, arg); + mi_print_amount(stat->freed, -1, out, arg); + mi_print_amount(stat->current, -1, out, arg); + if (unit == -1) { + _mi_fprintf(out, arg, "%24s", ""); + } + else { + mi_print_amount(-unit, 1, out, arg); + mi_print_count((stat->allocated / -unit), 0, out, arg); + } + } + if (stat->allocated > stat->freed) { + _mi_fprintf(out, arg, " "); + _mi_fprintf(out, arg, (notok == NULL ? "not all freed" : notok)); + _mi_fprintf(out, arg, "\n"); } - if (stat->allocated > stat->freed) - _mi_fprintf(out, arg, " not all freed!\n"); - else + else { _mi_fprintf(out, arg, " ok\n"); + } } else { mi_print_amount(stat->peak, 1, out, arg); mi_print_amount(stat->allocated, 1, out, arg); - _mi_fprintf(out, arg, "%11s", " "); // no freed + _mi_fprintf(out, arg, "%11s", " "); // no freed mi_print_amount(stat->current, 1, out, arg); _mi_fprintf(out, arg, "\n"); } } +static void mi_stat_print(const mi_stat_count_t* stat, const char* msg, int64_t unit, mi_output_fun* out, void* arg) { + mi_stat_print_ex(stat, msg, unit, out, arg, NULL); +} + +static void mi_stat_peak_print(const mi_stat_count_t* stat, const char* msg, int64_t unit, mi_output_fun* out, void* arg) { + _mi_fprintf(out, arg, "%10s:", msg); + mi_print_amount(stat->peak, unit, out, arg); + _mi_fprintf(out, arg, "\n"); +} + static void mi_stat_counter_print(const mi_stat_counter_t* stat, const char* msg, mi_output_fun* out, void* arg ) { _mi_fprintf(out, arg, "%10s:", msg); mi_print_amount(stat->total, -1, out, arg); _mi_fprintf(out, arg, "\n"); } + static void mi_stat_counter_print_avg(const mi_stat_counter_t* stat, const char* msg, mi_output_fun* out, void* arg) { - const int64_t avg_tens = (stat->count == 0 ? 
0 : (stat->total*10 / stat->count)); + const int64_t avg_tens = (stat->count == 0 ? 0 : (stat->total*10 / stat->count)); const long avg_whole = (long)(avg_tens/10); const long avg_frac1 = (long)(avg_tens%10); _mi_fprintf(out, arg, "%10s: %5ld.%ld avg\n", msg, avg_whole, avg_frac1); @@ -221,7 +240,7 @@ static void mi_stat_counter_print_avg(const mi_stat_counter_t* stat, const char* static void mi_print_header(mi_output_fun* out, void* arg ) { - _mi_fprintf(out, arg, "%10s: %10s %10s %10s %10s %10s %10s\n", "heap stats", "peak ", "total ", "freed ", "current ", "unit ", "count "); + _mi_fprintf(out, arg, "%10s: %11s %11s %11s %11s %11s %11s\n", "heap stats", "peak ", "total ", "freed ", "current ", "unit ", "count "); } #if MI_STAT>1 @@ -232,7 +251,7 @@ static void mi_stats_print_bins(const mi_stat_count_t* bins, size_t max, const c if (bins[i].allocated > 0) { found = true; int64_t unit = _mi_bin_size((uint8_t)i); - snprintf(buf, 64, "%s %3lu", fmt, (long)i); + _mi_snprintf(buf, 64, "%s %3lu", fmt, (long)i); mi_stat_print(&bins[i], buf, unit, out, arg); } } @@ -253,7 +272,7 @@ typedef struct buffered_s { mi_output_fun* out; // original output function void* arg; // and state char* buf; // local buffer of at least size `count+1` - size_t used; // currently used chars `used <= count` + size_t used; // currently used chars `used <= count` size_t count; // total chars available for output } buffered_t; @@ -263,7 +282,7 @@ static void mi_buffered_flush(buffered_t* buf) { buf->used = 0; } -static void mi_buffered_out(const char* msg, void* arg) { +static void mi_cdecl mi_buffered_out(const char* msg, void* arg) { buffered_t* buf = (buffered_t*)arg; if (msg==NULL || buf==NULL) return; for (const char* src = msg; *src != 0; src++) { @@ -279,8 +298,6 @@ static void mi_buffered_out(const char* msg, void* arg) { // Print statistics //------------------------------------------------------------ -static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults); - static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_attr_noexcept { // wrap the output function to be line buffered char buf[256]; @@ -296,21 +313,20 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) #endif #if MI_STAT mi_stat_print(&stats->normal, "normal", (stats->normal_count.count == 0 ? 1 : -(stats->normal.allocated / stats->normal_count.count)), out, arg); - mi_stat_print(&stats->huge, "huge", (stats->huge_count.count == 0 ? 1 : -(stats->huge.allocated / stats->huge_count.count)), out, arg); - mi_stat_print(&stats->giant, "giant", (stats->giant_count.count == 0 ? 1 : -(stats->giant.allocated / stats->giant_count.count)), out, arg); + mi_stat_print(&stats->huge, "huge", (stats->huge_count.count == 0 ? 
1 : -(stats->huge.allocated / stats->huge_count.count)), out, arg); mi_stat_count_t total = { 0,0,0,0 }; mi_stat_add(&total, &stats->normal, 1); mi_stat_add(&total, &stats->huge, 1); - mi_stat_add(&total, &stats->giant, 1); mi_stat_print(&total, "total", 1, out, arg); #endif #if MI_STAT>1 mi_stat_print(&stats->malloc, "malloc req", 1, out, arg); _mi_fprintf(out, arg, "\n"); #endif - mi_stat_print(&stats->reserved, "reserved", 1, out, arg); - mi_stat_print(&stats->committed, "committed", 1, out, arg); - mi_stat_print(&stats->reset, "reset", 1, out, arg); + mi_stat_print_ex(&stats->reserved, "reserved", 1, out, arg, ""); + mi_stat_print_ex(&stats->committed, "committed", 1, out, arg, ""); + mi_stat_peak_print(&stats->reset, "reset", 1, out, arg ); + mi_stat_peak_print(&stats->purged, "purged", 1, out, arg ); mi_stat_print(&stats->page_committed, "touched", 1, out, arg); mi_stat_print(&stats->segments, "segments", -1, out, arg); mi_stat_print(&stats->segments_abandoned, "-abandoned", -1, out, arg); @@ -319,22 +335,27 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_stat_print(&stats->pages_abandoned, "-abandoned", -1, out, arg); mi_stat_counter_print(&stats->pages_extended, "-extended", out, arg); mi_stat_counter_print(&stats->page_no_retire, "-noretire", out, arg); + mi_stat_counter_print(&stats->arena_count, "arenas", out, arg); + mi_stat_counter_print(&stats->arena_crossover_count, "-crossover", out, arg); + mi_stat_counter_print(&stats->arena_rollback_count, "-rollback", out, arg); mi_stat_counter_print(&stats->mmap_calls, "mmaps", out, arg); mi_stat_counter_print(&stats->commit_calls, "commits", out, arg); + mi_stat_counter_print(&stats->reset_calls, "resets", out, arg); + mi_stat_counter_print(&stats->purge_calls, "purges", out, arg); mi_stat_print(&stats->threads, "threads", -1, out, arg); mi_stat_counter_print_avg(&stats->searches, "searches", out, arg); - _mi_fprintf(out, arg, "%10s: %7i\n", "numa nodes", _mi_os_numa_node_count()); - - mi_msecs_t elapsed; - mi_msecs_t user_time; - mi_msecs_t sys_time; + _mi_fprintf(out, arg, "%10s: %5zu\n", "numa nodes", _mi_os_numa_node_count()); + + size_t elapsed; + size_t user_time; + size_t sys_time; size_t current_rss; size_t peak_rss; size_t current_commit; size_t peak_commit; size_t page_faults; - mi_stat_process_info(&elapsed, &user_time, &sys_time, ¤t_rss, &peak_rss, ¤t_commit, &peak_commit, &page_faults); - _mi_fprintf(out, arg, "%10s: %7ld.%03ld s\n", "elapsed", elapsed/1000, elapsed%1000); + mi_process_info(&elapsed, &user_time, &sys_time, ¤t_rss, &peak_rss, ¤t_commit, &peak_commit, &page_faults); + _mi_fprintf(out, arg, "%10s: %5ld.%03ld s\n", "elapsed", elapsed/1000, elapsed%1000); _mi_fprintf(out, arg, "%10s: user: %ld.%03ld s, system: %ld.%03ld s, faults: %lu, rss: ", "process", user_time/1000, user_time%1000, sys_time/1000, sys_time%1000, (unsigned long)page_faults ); mi_printf_amount((int64_t)peak_rss, 1, out, arg, "%s"); @@ -342,7 +363,7 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) _mi_fprintf(out, arg, ", commit: "); mi_printf_amount((int64_t)peak_commit, 1, out, arg, "%s"); } - _mi_fprintf(out, arg, "\n"); + _mi_fprintf(out, arg, "\n"); } static mi_msecs_t mi_process_start; // = 0 @@ -392,42 +413,12 @@ void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept { // ---------------------------------------------------------------- // Basic timer for convenience; use milli-seconds to avoid doubles // 
---------------------------------------------------------------- -#ifdef _WIN32 -#include -static mi_msecs_t mi_to_msecs(LARGE_INTEGER t) { - static LARGE_INTEGER mfreq; // = 0 - if (mfreq.QuadPart == 0LL) { - LARGE_INTEGER f; - QueryPerformanceFrequency(&f); - mfreq.QuadPart = f.QuadPart/1000LL; - if (mfreq.QuadPart == 0) mfreq.QuadPart = 1; - } - return (mi_msecs_t)(t.QuadPart / mfreq.QuadPart); -} + +static mi_msecs_t mi_clock_diff; mi_msecs_t _mi_clock_now(void) { - LARGE_INTEGER t; - QueryPerformanceCounter(&t); - return mi_to_msecs(t); -} -#else -#include -#ifdef CLOCK_REALTIME -mi_msecs_t _mi_clock_now(void) { - struct timespec t; - clock_gettime(CLOCK_REALTIME, &t); - return ((mi_msecs_t)t.tv_sec * 1000) + ((mi_msecs_t)t.tv_nsec / 1000000); -} -#else -// low resolution timer -mi_msecs_t _mi_clock_now(void) { - return ((mi_msecs_t)clock() / ((mi_msecs_t)CLOCKS_PER_SEC / 1000)); + return _mi_prim_clock_now(); } -#endif -#endif - - -static mi_msecs_t mi_clock_diff; mi_msecs_t _mi_clock_start(void) { if (mi_clock_diff == 0.0) { @@ -447,129 +438,27 @@ mi_msecs_t _mi_clock_end(mi_msecs_t start) { // Basic process statistics // -------------------------------------------------------- -#if defined(_WIN32) -#include -#include -#pragma comment(lib,"psapi.lib") - -static mi_msecs_t filetime_msecs(const FILETIME* ftime) { - ULARGE_INTEGER i; - i.LowPart = ftime->dwLowDateTime; - i.HighPart = ftime->dwHighDateTime; - mi_msecs_t msecs = (i.QuadPart / 10000); // FILETIME is in 100 nano seconds - return msecs; -} - -static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults) -{ - *elapsed = _mi_clock_end(mi_process_start); - FILETIME ct; - FILETIME ut; - FILETIME st; - FILETIME et; - GetProcessTimes(GetCurrentProcess(), &ct, &et, &st, &ut); - *utime = filetime_msecs(&ut); - *stime = filetime_msecs(&st); - PROCESS_MEMORY_COUNTERS info; - GetProcessMemoryInfo(GetCurrentProcess(), &info, sizeof(info)); - *current_rss = (size_t)info.WorkingSetSize; - *peak_rss = (size_t)info.PeakWorkingSetSize; - *current_commit = (size_t)info.PagefileUsage; - *peak_commit = (size_t)info.PeakPagefileUsage; - *page_faults = (size_t)info.PageFaultCount; -} - -#elif defined(__unix__) || defined(__unix) || defined(unix) || defined(__APPLE__) || defined(__HAIKU__) -#include -#include -#include - -#if defined(__APPLE__) -#include -#endif - -#if defined(__HAIKU__) -#error #include -#endif - -static mi_msecs_t timeval_secs(const struct timeval* tv) { - return ((mi_msecs_t)tv->tv_sec * 1000L) + ((mi_msecs_t)tv->tv_usec / 1000L); -} - -static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults) -{ - *elapsed = _mi_clock_end(mi_process_start); - struct rusage rusage; - getrusage(RUSAGE_SELF, &rusage); - *utime = timeval_secs(&rusage.ru_utime); - *stime = timeval_secs(&rusage.ru_stime); -#if !defined(__HAIKU__) - *page_faults = rusage.ru_majflt; -#endif - // estimate commit using our stats - *peak_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.peak)); - *current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.current)); - *current_rss = *current_commit; // estimate -#if defined(__HAIKU__) - // Haiku does not have (yet?) 
a way to - // get these stats per process - thread_info tid; - area_info mem; - ssize_t c; - get_thread_info(find_thread(0), &tid); - while (get_next_area_info(tid.team, &c, &mem) == B_OK) { - *peak_rss += mem.ram_size; - } -#elif defined(__APPLE__) - *peak_rss = rusage.ru_maxrss; // BSD reports in bytes - struct mach_task_basic_info info; - mach_msg_type_number_t infoCount = MACH_TASK_BASIC_INFO_COUNT; - if (task_info(mach_task_self(), MACH_TASK_BASIC_INFO, (task_info_t)&info, &infoCount) == KERN_SUCCESS) { - *current_rss = (size_t)info.resident_size; - } -#else - *peak_rss = rusage.ru_maxrss * 1024; // Linux reports in KiB -#endif -} - -#else -#ifndef __wasi__ -// WebAssembly instances are not processes -#pragma message("define a way to get process info") -#endif - -static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults) -{ - *elapsed = _mi_clock_end(mi_process_start); - *peak_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.peak)); - *current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.current)); - *peak_rss = *peak_commit; - *current_rss = *current_commit; - *page_faults = 0; - *utime = 0; - *stime = 0; -} -#endif - - mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults) mi_attr_noexcept { - mi_msecs_t elapsed = 0; - mi_msecs_t utime = 0; - mi_msecs_t stime = 0; - size_t current_rss0 = 0; - size_t peak_rss0 = 0; - size_t current_commit0 = 0; - size_t peak_commit0 = 0; - size_t page_faults0 = 0; - mi_stat_process_info(&elapsed,&utime, &stime, ¤t_rss0, &peak_rss0, ¤t_commit0, &peak_commit0, &page_faults0); - if (elapsed_msecs!=NULL) *elapsed_msecs = (elapsed < 0 ? 0 : (elapsed < (mi_msecs_t)PTRDIFF_MAX ? (size_t)elapsed : PTRDIFF_MAX)); - if (user_msecs!=NULL) *user_msecs = (utime < 0 ? 0 : (utime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)utime : PTRDIFF_MAX)); - if (system_msecs!=NULL) *system_msecs = (stime < 0 ? 0 : (stime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)stime : PTRDIFF_MAX)); - if (current_rss!=NULL) *current_rss = current_rss0; - if (peak_rss!=NULL) *peak_rss = peak_rss0; - if (current_commit!=NULL) *current_commit = current_commit0; - if (peak_commit!=NULL) *peak_commit = peak_commit0; - if (page_faults!=NULL) *page_faults = page_faults0; + mi_process_info_t pinfo; + _mi_memzero_var(pinfo); + pinfo.elapsed = _mi_clock_end(mi_process_start); + pinfo.current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.current)); + pinfo.peak_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.peak)); + pinfo.current_rss = pinfo.current_commit; + pinfo.peak_rss = pinfo.peak_commit; + pinfo.utime = 0; + pinfo.stime = 0; + pinfo.page_faults = 0; + + _mi_prim_process_info(&pinfo); + + if (elapsed_msecs!=NULL) *elapsed_msecs = (pinfo.elapsed < 0 ? 0 : (pinfo.elapsed < (mi_msecs_t)PTRDIFF_MAX ? (size_t)pinfo.elapsed : PTRDIFF_MAX)); + if (user_msecs!=NULL) *user_msecs = (pinfo.utime < 0 ? 0 : (pinfo.utime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)pinfo.utime : PTRDIFF_MAX)); + if (system_msecs!=NULL) *system_msecs = (pinfo.stime < 0 ? 0 : (pinfo.stime < (mi_msecs_t)PTRDIFF_MAX ? 
(size_t)pinfo.stime : PTRDIFF_MAX)); + if (current_rss!=NULL) *current_rss = pinfo.current_rss; + if (peak_rss!=NULL) *peak_rss = pinfo.peak_rss; + if (current_commit!=NULL) *current_commit = pinfo.current_commit; + if (peak_commit!=NULL) *peak_commit = pinfo.peak_commit; + if (page_faults!=NULL) *page_faults = pinfo.page_faults; } - diff --git a/contrib/libs/mimalloc/ya.make b/contrib/libs/mimalloc/ya.make index b23b8c23942f..cad456e3eaa4 100644 --- a/contrib/libs/mimalloc/ya.make +++ b/contrib/libs/mimalloc/ya.make @@ -1,21 +1,51 @@ -LIBRARY() +# Generated by devtools/yamaker from nixpkgs 22.11. -CFLAGS( - -w - -DMI_MALLOC_OVERRIDE=1 - -DMI_PADDING=0 -) +LIBRARY() LICENSE(MIT) LICENSE_TEXTS(.yandex_meta/licenses.list.txt) -VERSION(1.7.2) +VERSION(1.8.7) + +ORIGINAL_SOURCE(https://github.com/microsoft/mimalloc/archive/refs/tags/v1.8.7.tar.gz) + +PEERDIR( + library/cpp/sanitizer/include +) + +ADDINCL( + GLOBAL contrib/libs/mimalloc/include +) + +NO_COMPILER_WARNINGS() + +NO_RUNTIME() + +CFLAGS( + -DMI_MALLOC_OVERRIDE + -DMI_SHARED_LIB + -DMI_SHARED_LIB_EXPORT + -DMI_STATIC_LIB +) -ADDINCL(contrib/libs/mimalloc/include) -NO_UTIL() SRCS( - src/static.c + src/alloc-aligned.c + src/alloc-posix.c + src/alloc.c + src/arena.c + src/bitmap.c + src/heap.c + src/init.c + src/libc.c + src/options.c + src/os.c + src/page.c + src/prim/prim.c + src/random.c + src/segment-map.c + src/segment.c + src/stats.c ) END() From b21317884da9622ac16bbad26627ae014d72d6a7 Mon Sep 17 00:00:00 2001 From: arkady-e1ppa Date: Fri, 29 Nov 2024 22:02:21 +0300 Subject: [PATCH 07/16] YT-21233: Introduce ToAttributeValue cpo which removes dependency of TErrorAttribute from yt/core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Plan: 1) Remove `IAttributedDictionary` type from the public API. \+ 2) Remove `Set` method from public API in favor of `operator<<=`. \+ 3) Adopt `ConvertTo` (or other name) CPO with proper extension into `NYT::NYson::ConvertTo` from `yt/core`. 4) Use CPO from (3) to eliminate direct dependency on `yt/core` of `Get/Find` methods from attributes API. 5) Adopt `ConvertToYsonString` (or other name) CPO with proper extension into `yt/core` customisations. 6) Use CPO from (5) to eliminate direct dependency on `yt/core` of `TErrorAttribute` ctor. 7) Swap attributes implementation to the one which doesn’t use `IAttributeDictionary`. 8) At this point `stripped_error*` can be moved to library/cpp/yt and so can recursively dependant on THROW macro methods `Get/Find/…`. 9) Adjust CPO’s to work with `std::string` instead of `TYsonString` assuming text format to be used (maybe `TString` for now). 10) Remove dep of `library/cpp/yt/error` on `yson` entirely. This pr addresses 5-6 steps of plan. Below is a brief explanation of design decisions. We also expressed everything related to key-value code in terms of aliases of `TErrorAttribute` so that later we could make a relatively simple switch to `std::string`. 
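The concrete `TagInvoke` overloads added by this patch live in `error-inl.h`; as a rough, self-contained illustration of the mechanism (not the actual library code — `NExample`, `NUser`, `TToAttributeValueFn` and `TMyType` below are invented names, and the real CPO is built on `TTagInvokeCpoBase` rather than written by hand), steps 5-6 boil down to something like:

    // Illustrative sketch only: a hand-rolled tag_invoke-style CPO in plain C++17.
    #include <string>

    namespace NExample {

    // The CPO itself: callers write ToAttributeValue(x); the call forwards to a
    // tag_invoke overload found by ADL next to the type being converted.
    struct TToAttributeValueFn
    {
        template <class T>
        auto operator()(const T& value) const
            -> decltype(tag_invoke(*this, value))
        {
            return tag_invoke(*this, value);
        }
    };

    inline constexpr TToAttributeValueFn ToAttributeValue = {};

    } // namespace NExample

    namespace NUser {

    struct TMyType
    {
        int X = 0;
    };

    // The customization lives next to TMyType, so the library defining the CPO
    // never has to know about the serialization code (this is what removes the
    // direct yt/core dependency from the TErrorAttribute ctor in step 6).
    std::string tag_invoke(NExample::TToAttributeValueFn, const TMyType& value)
    {
        return "{x=" + std::to_string(value.X) + "}";
    }

    } // namespace NUser

    int main()
    {
        NUser::TMyType value{42};
        std::string serialized = NExample::ToAttributeValue(value); // "{x=42}"
        return serialized.empty();
    }

The point of routing through ADL-found `tag_invoke` overloads is that the actual conversion to `TYsonString` can be supplied by `yt/core` (or any other downstream) while `TErrorAttribute` itself only sees the CPO.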
We expect to do steps 7-8 in the next pull request as everything should be ready commit_hash:de9feca2bd24d823b33d904ef0fa5f4856f9b020 --- library/cpp/yt/error/convert_to_cpo.h | 4 +- library/cpp/yt/error/error_attribute.h | 57 +++++++++++++++++++++ library/cpp/yt/error/error_attributes-inl.h | 18 +++---- library/cpp/yt/error/error_attributes.cpp | 2 +- library/cpp/yt/error/error_attributes.h | 35 +++++++------ library/cpp/yt/error/mergeable_dictionary.h | 6 ++- library/cpp/yt/error/public.h | 2 + yt/yt/core/misc/error-inl.h | 24 +++++++++ yt/yt/core/misc/error.h | 4 ++ yt/yt/core/misc/stripped_error.cpp | 10 ++-- yt/yt/core/misc/stripped_error.h | 19 ------- yt/yt/core/ytree/convert-inl.h | 32 +++++++----- 12 files changed, 143 insertions(+), 70 deletions(-) create mode 100644 library/cpp/yt/error/error_attribute.h create mode 100644 yt/yt/core/misc/error-inl.h diff --git a/library/cpp/yt/error/convert_to_cpo.h b/library/cpp/yt/error/convert_to_cpo.h index 55f608c3b605..dcfc8b0677db 100644 --- a/library/cpp/yt/error/convert_to_cpo.h +++ b/library/cpp/yt/error/convert_to_cpo.h @@ -2,8 +2,6 @@ #include -#include - namespace NYT { //////////////////////////////////////////////////////////////////////////////// @@ -34,7 +32,7 @@ inline constexpr NConvertToImpl::TFn ConvertTo = {}; //////////////////////////////////////////////////////////////////////////////// template -concept CConvertToWorks = requires (const TFrom& from) { +concept CConvertsTo = requires (const TFrom& from) { { NYT::ConvertTo(from) } -> std::same_as; }; diff --git a/library/cpp/yt/error/error_attribute.h b/library/cpp/yt/error/error_attribute.h new file mode 100644 index 000000000000..dec4a4dd9b34 --- /dev/null +++ b/library/cpp/yt/error/error_attribute.h @@ -0,0 +1,57 @@ +#pragma once + +#include + +// TODO(arkady-e1ppa): Eliminate. +#include + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +namespace NToAttributeValueImpl { + +struct TFn + : public TTagInvokeCpoBase +{ }; + +} // namespace NToAttributeValueImpl + +//////////////////////////////////////////////////////////////////////////////// + +inline constexpr NToAttributeValueImpl::TFn ToAttributeValue = {}; + +//////////////////////////////////////////////////////////////////////////////// + +template +concept CConvertibleToAttributeValue = CTagInvocableS< + TTagInvokeTag, + NYson::TYsonString(const T&)>; + +//////////////////////////////////////////////////////////////////////////////// + +struct TErrorAttribute +{ + // NB(arkady-e1ppa): Switch to std::string is quite possible + // however it requires patching IAttributeDictionary or + // switching it to the std::string first for interop reasons. + // Do that later. + using TKey = TString; + // TODO(arkady-e1ppa): Use ConvertToYsonString(value, Format::Text) + // here for complex values. Write manual implementations as ToString + // for primitive types (e.g. integral types, guid, string, time). 
+ using TValue = NYson::TYsonString; + + template + TErrorAttribute(const TKey& key, const T& value) + : Key(key) + , Value(NYT::ToAttributeValue(value)) + { } + + TKey Key; + TValue Value; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/library/cpp/yt/error/error_attributes-inl.h b/library/cpp/yt/error/error_attributes-inl.h index 8ffbf3e0a394..6574665865be 100644 --- a/library/cpp/yt/error/error_attributes-inl.h +++ b/library/cpp/yt/error/error_attributes-inl.h @@ -9,7 +9,7 @@ namespace NYT { //////////////////////////////////////////////////////////////////////////////// template - requires CConvertToWorks + requires CConvertsTo T TErrorAttributes::Get(TStringBuf key) const { auto yson = GetYson(key); @@ -21,7 +21,7 @@ T TErrorAttributes::Get(TStringBuf key) const } template - requires CConvertToWorks + requires CConvertsTo typename TOptionalTraits::TOptional TErrorAttributes::Find(TStringBuf key) const { auto yson = FindYson(key); @@ -36,8 +36,8 @@ typename TOptionalTraits::TOptional TErrorAttributes::Find(TStringBuf key) co } template - requires CConvertToWorks -T TErrorAttributes::GetAndRemove(const TString& key) + requires CConvertsTo +T TErrorAttributes::GetAndRemove(const TKey& key) { auto result = Get(key); Remove(key); @@ -45,15 +45,15 @@ T TErrorAttributes::GetAndRemove(const TString& key) } template - requires CConvertToWorks + requires CConvertsTo T TErrorAttributes::Get(TStringBuf key, const T& defaultValue) const { return Find(key).value_or(defaultValue); } template - requires CConvertToWorks -T TErrorAttributes::GetAndRemove(const TString& key, const T& defaultValue) + requires CConvertsTo +T TErrorAttributes::GetAndRemove(const TKey& key, const T& defaultValue) { auto result = Find(key); if (result) { @@ -65,8 +65,8 @@ T TErrorAttributes::GetAndRemove(const TString& key, const T& defaultValue) } template - requires CConvertToWorks -typename TOptionalTraits::TOptional TErrorAttributes::FindAndRemove(const TString& key) + requires CConvertsTo +typename TOptionalTraits::TOptional TErrorAttributes::FindAndRemove(const TKey& key) { auto result = Find(key); if (result) { diff --git a/library/cpp/yt/error/error_attributes.cpp b/library/cpp/yt/error/error_attributes.cpp index 2c1b54231463..09aa48eebb96 100644 --- a/library/cpp/yt/error/error_attributes.cpp +++ b/library/cpp/yt/error/error_attributes.cpp @@ -15,7 +15,7 @@ void TErrorAttributes::Clear() } } -NYson::TYsonString TErrorAttributes::GetYsonAndRemove(const TString& key) +TErrorAttributes::TValue TErrorAttributes::GetYsonAndRemove(const TKey& key) { auto result = GetYson(key); Remove(key); diff --git a/library/cpp/yt/error/error_attributes.h b/library/cpp/yt/error/error_attributes.h index 7c2f4a540783..80dd80de484e 100644 --- a/library/cpp/yt/error/error_attributes.h +++ b/library/cpp/yt/error/error_attributes.h @@ -1,6 +1,7 @@ #pragma once #include "convert_to_cpo.h" +#include "error_attribute.h" #include "mergeable_dictionary.h" #include @@ -20,34 +21,34 @@ namespace NYT { class TErrorAttributes { public: - using TKey = TString; - using TValue = NYson::TYsonString; + using TKey = TErrorAttribute::TKey; + using TValue = TErrorAttribute::TValue; using TKeyValuePair = std::pair; //! Returns the list of all keys in the dictionary. - std::vector ListKeys() const; + std::vector ListKeys() const; //! Returns the list of all key-value pairs in the dictionary. std::vector ListPairs() const; //! 
Returns the value of the attribute (null indicates that the attribute is not found). - NYson::TYsonString FindYson(TStringBuf key) const; + TValue FindYson(TStringBuf key) const; //! Sets the value of the attribute. - void SetYson(const TString& key, const NYson::TYsonString& value); + void SetYson(const TKey& key, const TValue& value); //! Removes the attribute. //! Returns |true| if the attribute was removed or |false| if there is no attribute with this key. - bool Remove(const TString& key); + bool Remove(const TKey& key); //! Removes all attributes. void Clear(); //! Returns the value of the attribute (throws an exception if the attribute is not found). - NYson::TYsonString GetYson(TStringBuf key) const; + TValue GetYson(TStringBuf key) const; //! Same as #GetYson but removes the value. - NYson::TYsonString GetYsonAndRemove(const TString& key); + TValue GetYsonAndRemove(const TKey& key); //! Returns |true| iff the given key is present. bool Contains(TStringBuf key) const; @@ -58,35 +59,35 @@ class TErrorAttributes //! Finds the attribute and deserializes its value. //! Throws if no such value is found. template - requires CConvertToWorks + requires CConvertsTo T Get(TStringBuf key) const; //! Same as #Get but removes the value. template - requires CConvertToWorks - T GetAndRemove(const TString& key); + requires CConvertsTo + T GetAndRemove(const TKey& key); //! Finds the attribute and deserializes its value. //! Uses default value if no such attribute is found. template - requires CConvertToWorks + requires CConvertsTo T Get(TStringBuf key, const T& defaultValue) const; //! Same as #Get but removes the value if it exists. template - requires CConvertToWorks - T GetAndRemove(const TString& key, const T& defaultValue); + requires CConvertsTo + T GetAndRemove(const TKey& key, const T& defaultValue); //! Finds the attribute and deserializes its value. //! Returns null if no such attribute is found. template - requires CConvertToWorks + requires CConvertsTo typename TOptionalTraits::TOptional Find(TStringBuf key) const; //! Same as #Find but removes the value if it exists. template - requires CConvertToWorks - typename TOptionalTraits::TOptional FindAndRemove(const TString& key); + requires CConvertsTo + typename TOptionalTraits::TOptional FindAndRemove(const TKey& key); template void MergeFrom(const TDictionary& dict); diff --git a/library/cpp/yt/error/mergeable_dictionary.h b/library/cpp/yt/error/mergeable_dictionary.h index 90597059e406..361694d84163 100644 --- a/library/cpp/yt/error/mergeable_dictionary.h +++ b/library/cpp/yt/error/mergeable_dictionary.h @@ -1,6 +1,7 @@ #pragma once #include "public.h" +#include "error_attribute.h" #include @@ -23,6 +24,7 @@ namespace NDetail { template struct TMergeableDictionaryImpl { + // TL;DR: MakeIterableView returns something like std::span>. 
using TView = std::invoke_result_t::MakeIterableView), const T&>; using TIterator = std::ranges::iterator_t; using TValue = typename std::iterator_traits::value_type; @@ -33,10 +35,10 @@ struct TMergeableDictionaryImpl static constexpr bool CorrectTupleElements = requires { typename std::tuple_element<0, TValue>::type; - std::same_as::type, TString>; + std::same_as::type, TErrorAttribute::TKey>; typename std::tuple_element<1, TValue>::type; - std::same_as::type, NYson::TYsonString>; + std::same_as::type, TErrorAttribute::TValue>; }; }; diff --git a/library/cpp/yt/error/public.h b/library/cpp/yt/error/public.h index 63b95497c7b4..04201128aa39 100644 --- a/library/cpp/yt/error/public.h +++ b/library/cpp/yt/error/public.h @@ -9,6 +9,8 @@ namespace NYT { template class TErrorOr; + +struct TErrorAttribute; class TErrorAttributes; struct TOriginAttributes; diff --git a/yt/yt/core/misc/error-inl.h b/yt/yt/core/misc/error-inl.h new file mode 100644 index 000000000000..3185de11b2e7 --- /dev/null +++ b/yt/yt/core/misc/error-inl.h @@ -0,0 +1,24 @@ +#ifndef ERROR_INL_H_ +#error "Direct inclusion of this file is not allowed, include error.h" +// For the sake of sane code completion. +#include "error.h" +#endif + +namespace NYT::NToAttributeValueImpl { + +//////////////////////////////////////////////////////////////////////////////// + +template +NYson::TYsonString TagInvoke(TTagInvokeTag, const T& value) +{ + return NYson::ConvertToYsonString(value); +} + +inline NYson::TYsonString TagInvoke(TTagInvokeTag, const NYson::TYsonString& value) +{ + return value; +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NToAttributeValueImpl diff --git a/yt/yt/core/misc/error.h b/yt/yt/core/misc/error.h index f3c773da3386..8399b49301d4 100644 --- a/yt/yt/core/misc/error.h +++ b/yt/yt/core/misc/error.h @@ -91,3 +91,7 @@ struct TSerializerTraits< //////////////////////////////////////////////////////////////////////////////// } // namespace NYT + +#define ERROR_INL_H_ +#include "error-inl.h" +#undef ERROR_INL_H_ diff --git a/yt/yt/core/misc/stripped_error.cpp b/yt/yt/core/misc/stripped_error.cpp index e43a61953f12..5be93260f930 100644 --- a/yt/yt/core/misc/stripped_error.cpp +++ b/yt/yt/core/misc/stripped_error.cpp @@ -205,7 +205,7 @@ class TError::TImpl // which has minimal API of original dict with backend being the // actual original dict. Once API-related issues are fixed we are // free to implement a backend which doesn't depend on original dict. 
-std::vector TErrorAttributes::ListKeys() const +std::vector TErrorAttributes::ListKeys() const { auto* attributes = static_cast(Attributes_); if (!attributes) { @@ -223,7 +223,7 @@ std::vector TErrorAttributes::ListPairs() const return attributes->ListPairs(); } -NYson::TYsonString TErrorAttributes::FindYson(TStringBuf key) const +TErrorAttributes::TValue TErrorAttributes::FindYson(TStringBuf key) const { auto* attributes = static_cast(Attributes_); if (!attributes) { @@ -232,14 +232,14 @@ NYson::TYsonString TErrorAttributes::FindYson(TStringBuf key) const return attributes->FindYson(key); } -void TErrorAttributes::SetYson(const TString& key, const NYson::TYsonString& value) +void TErrorAttributes::SetYson(const TKey& key, const TValue& value) { auto* attributes = static_cast(Attributes_); YT_VERIFY(attributes); return attributes->SetYson(key, value); } -bool TErrorAttributes::Remove(const TString& key) +bool TErrorAttributes::Remove(const TKey& key) { auto* attributes = static_cast(Attributes_); if (!attributes) { @@ -248,7 +248,7 @@ bool TErrorAttributes::Remove(const TString& key) return attributes->Remove(key); } -NYson::TYsonString TErrorAttributes::GetYson(TStringBuf key) const +TErrorAttributes::TValue TErrorAttributes::GetYson(TStringBuf key) const { auto result = FindYson(key); if (!result) { diff --git a/yt/yt/core/misc/stripped_error.h b/yt/yt/core/misc/stripped_error.h index ad985394b8fb..6dd7b4b87eec 100644 --- a/yt/yt/core/misc/stripped_error.h +++ b/yt/yt/core/misc/stripped_error.h @@ -60,25 +60,6 @@ void FormatValue(TStringBuilderBase* builder, TErrorCode code, TStringBuf spec); //////////////////////////////////////////////////////////////////////////////// -struct TErrorAttribute -{ - template - TErrorAttribute(const TString& key, const T& value) - : Key(key) - , Value(NYson::ConvertToYsonString(value)) - { } - - TErrorAttribute(const TString& key, const NYson::TYsonString& value) - : Key(key) - , Value(value) - { } - - TString Key; - NYson::TYsonString Value; -}; - -//////////////////////////////////////////////////////////////////////////////// - template concept CErrorNestable = requires (TError& error, TValue&& operand) { diff --git a/yt/yt/core/ytree/convert-inl.h b/yt/yt/core/ytree/convert-inl.h index 694aca2d81f6..0e3385ed5bf5 100644 --- a/yt/yt/core/ytree/convert-inl.h +++ b/yt/yt/core/ytree/convert-inl.h @@ -182,12 +182,20 @@ T ConstructYTreeConvertibleObject() //////////////////////////////////////////////////////////////////////////////// -namespace { +} // namespace NYT::NYTree + +//////////////////////////////////////////////////////////////////////////////// + +namespace NYT::NConvertToImpl { //////////////////////////////////////////////////////////////////////////////// +namespace { + double ConvertYsonStringBaseToDouble(const NYson::TYsonStringBuf& yson) { + using namespace NYT::NYTree; + NYson::TTokenizer tokenizer(yson.AsStringBuf()); const auto& token = SkipAttributes(&tokenizer); switch (token.GetType()) { @@ -204,8 +212,12 @@ double ConvertYsonStringBaseToDouble(const NYson::TYsonStringBuf& yson) } } +//////////////////////////////////////////////////////////////////////////////// + TString ConvertYsonStringBaseToString(const NYson::TYsonStringBuf& yson) { + using namespace NYT::NYTree; + NYson::TTokenizer tokenizer(yson.AsStringBuf()); const auto& token = SkipAttributes(&tokenizer); switch (token.GetType()) { @@ -218,15 +230,7 @@ TString ConvertYsonStringBaseToString(const NYson::TYsonStringBuf& yson) } } 
-//////////////////////////////////////////////////////////////////////////////// - -} - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace NYT::NYTree - -namespace NYT::NConvertToImpl { +} // namespace //////////////////////////////////////////////////////////////////////////////// @@ -314,25 +318,25 @@ IMPLEMENT_CHECKED_INTEGRAL_CONVERT_TO(ui8) template <> inline double TagInvoke(TTagInvokeTag>, const NYson::TYsonString& str) { - return NYTree::ConvertYsonStringBaseToDouble(str); + return ConvertYsonStringBaseToDouble(str); } template <> inline double TagInvoke(TTagInvokeTag>, const NYson::TYsonStringBuf& str) { - return NYTree::ConvertYsonStringBaseToDouble(str); + return ConvertYsonStringBaseToDouble(str); } template <> inline TString TagInvoke(TTagInvokeTag>, const NYson::TYsonString& str) { - return NYTree::ConvertYsonStringBaseToString(str); + return ConvertYsonStringBaseToString(str); } template <> inline TString TagInvoke(TTagInvokeTag>, const NYson::TYsonStringBuf& str) { - return NYTree::ConvertYsonStringBaseToString(str); + return ConvertYsonStringBaseToString(str); } //////////////////////////////////////////////////////////////////////////////// From c88c90429a4914a0b120be7191d1572c43cc2a55 Mon Sep 17 00:00:00 2001 From: udovichenko-r Date: Fri, 29 Nov 2024 22:23:21 +0300 Subject: [PATCH 08/16] YQL-19309 Move s-expression large tests 1. Move tests under yt/yql 2. Enable Python tests (was disabled in github) 3. Remove canonization of tmp tables 4. Disable some tests with Python (fails now, will fix in separate ticket YQL-19341) commit_hash:6de642885f0451505c23c0a1319286c36393bf5c --- .../suites/Filter/LMapCombineWithFilter.cfg | 2 +- .../suites/Filter/LMapCombineWithFilter.yql | 2 +- .../suites/Filter/LMapWithFilter.cfg | 2 +- .../suites/Filter/LMapWithFilter.yql | 2 +- .../suites/ManyYamrOperations/Bug1465.cfg | 2 +- .../Optimizers/FuseLMapAfterLReduce.cfg | 3 ++- .../Optimizers/FuseLMapAfterLReduce.yql | 8 +++---- .../suites/Optimizers/FuseLMapAfterReduce.cfg | 3 ++- .../suites/Optimizers/FuseLMapAfterReduce.yql | 4 ++-- .../suites/Optimizers/FuseMapAfterLReduce.cfg | 3 ++- .../suites/Optimizers/FuseMapAfterLReduce.yql | 12 +++++----- .../Optimizers/PartitionByWithInnerSort.cfg | 2 +- .../Optimizers/PartitionByWithInnerSort.yql | 2 +- .../Optimizers/PartitionsByWithInnerSort.cfg | 2 +- .../s-expressions/suites/Udf/AutoMapMany.cfg | 2 +- .../s-expressions/suites/Udf/AutoMapMany.yql | 16 ++++++------- .../suites/Udf/AutoMapManyNamed.cfg | 2 +- .../suites/Udf/AutoMapManyNamed.yql | 24 +++++++++---------- .../s-expressions/suites/Udf/PythonAvg.cfg | 2 +- .../s-expressions/suites/Udf/PythonAvg.yql | 6 ++--- .../suites/Udf/PythonCallableAsArg.cfg | 2 +- .../suites/Udf/PythonCallableAsArg.yql | 4 ++-- .../suites/Udf/PythonCallableAsResult.cfg | 2 +- .../suites/Udf/PythonCallableAsResult.yql | 2 +- .../suites/Udf/PythonGenerator.cfg | 2 +- .../suites/Udf/PythonGenerator.yql | 6 ++--- .../Udf/PythonGeneratorExprWithClosure.cfg | 2 +- .../Udf/PythonGeneratorExprWithClosure.yql | 4 ++-- .../suites/Udf/PythonGeneratorWithClosure.cfg | 2 +- .../suites/Udf/PythonGeneratorWithClosure.yql | 4 ++-- .../suites/Udf/PythonPartialCall.cfg | 2 +- .../suites/Udf/PythonPartialCall.yql | 2 +- .../s-expressions/suites/Udf/PythonStruct.cfg | 2 +- .../s-expressions/suites/Udf/PythonStruct.yql | 2 +- .../s-expressions/suites/Udf/PythonSum.cfg | 2 +- .../s-expressions/suites/Udf/PythonSum.yql | 2 +- .../suites/View/ViewWithUdfProcess.cfg | 2 +- 
.../s-expressions/suites/View/input6.txt.attr | 2 +- 38 files changed, 75 insertions(+), 72 deletions(-) diff --git a/yql/essentials/tests/s-expressions/suites/Filter/LMapCombineWithFilter.cfg b/yql/essentials/tests/s-expressions/suites/Filter/LMapCombineWithFilter.cfg index 0bbf3ad9f029..aa782f323cb1 100644 --- a/yql/essentials/tests/s-expressions/suites/Filter/LMapCombineWithFilter.cfg +++ b/yql/essentials/tests/s-expressions/suites/Filter/LMapCombineWithFilter.cfg @@ -1,4 +1,4 @@ in Input input.txt out Output output.txt res result.txt -udf python2_udf +udf python3_udf diff --git a/yql/essentials/tests/s-expressions/suites/Filter/LMapCombineWithFilter.yql b/yql/essentials/tests/s-expressions/suites/Filter/LMapCombineWithFilter.yql index 887bad33500e..d6e6d8fe10cf 100644 --- a/yql/essentials/tests/s-expressions/suites/Filter/LMapCombineWithFilter.yql +++ b/yql/essentials/tests/s-expressions/suites/Filter/LMapCombineWithFilter.yql @@ -23,7 +23,7 @@ def MyFunc(list): return list @@)) -(let udf (ScriptUdf 'Python 'MyFunc udfType udfScript)) +(let udf (ScriptUdf 'Python3 'MyFunc udfType udfScript)) # filter keys less than '100' (let table (Filter table (lambda '(item) diff --git a/yql/essentials/tests/s-expressions/suites/Filter/LMapWithFilter.cfg b/yql/essentials/tests/s-expressions/suites/Filter/LMapWithFilter.cfg index 0bbf3ad9f029..aa782f323cb1 100644 --- a/yql/essentials/tests/s-expressions/suites/Filter/LMapWithFilter.cfg +++ b/yql/essentials/tests/s-expressions/suites/Filter/LMapWithFilter.cfg @@ -1,4 +1,4 @@ in Input input.txt out Output output.txt res result.txt -udf python2_udf +udf python3_udf diff --git a/yql/essentials/tests/s-expressions/suites/Filter/LMapWithFilter.yql b/yql/essentials/tests/s-expressions/suites/Filter/LMapWithFilter.yql index 68bb0ad6daaf..f5981f92ac41 100644 --- a/yql/essentials/tests/s-expressions/suites/Filter/LMapWithFilter.yql +++ b/yql/essentials/tests/s-expressions/suites/Filter/LMapWithFilter.yql @@ -23,7 +23,7 @@ def MyFunc(list): return list @@)) -(let udf (ScriptUdf 'Python 'MyFunc udfType udfScript '('('cpu '"5.0") '('extraMem '"12345")))) +(let udf (ScriptUdf 'Python3 'MyFunc udfType udfScript '('('cpu '"5.0") '('extraMem '"12345")))) # filter keys less than '100' (let table (Filter table (lambda '(item) diff --git a/yql/essentials/tests/s-expressions/suites/ManyYamrOperations/Bug1465.cfg b/yql/essentials/tests/s-expressions/suites/ManyYamrOperations/Bug1465.cfg index ac8f608e62d2..f4ca3c4cf6c5 100644 --- a/yql/essentials/tests/s-expressions/suites/ManyYamrOperations/Bug1465.cfg +++ b/yql/essentials/tests/s-expressions/suites/ManyYamrOperations/Bug1465.cfg @@ -3,4 +3,4 @@ out out1 out1.txt out out2 out2.txt res result.txt udf url_udf -udf python2_udf +udf python3_udf diff --git a/yql/essentials/tests/s-expressions/suites/Optimizers/FuseLMapAfterLReduce.cfg b/yql/essentials/tests/s-expressions/suites/Optimizers/FuseLMapAfterLReduce.cfg index 0bbf3ad9f029..62b2a5fe18f1 100644 --- a/yql/essentials/tests/s-expressions/suites/Optimizers/FuseLMapAfterLReduce.cfg +++ b/yql/essentials/tests/s-expressions/suites/Optimizers/FuseLMapAfterLReduce.cfg @@ -1,4 +1,5 @@ in Input input.txt out Output output.txt res result.txt -udf python2_udf +udf python3_udf +providers dummy diff --git a/yql/essentials/tests/s-expressions/suites/Optimizers/FuseLMapAfterLReduce.yql b/yql/essentials/tests/s-expressions/suites/Optimizers/FuseLMapAfterLReduce.yql index cc5b60392b94..3df99f0f47d3 100644 --- 
a/yql/essentials/tests/s-expressions/suites/Optimizers/FuseLMapAfterLReduce.yql +++ b/yql/essentials/tests/s-expressions/suites/Optimizers/FuseLMapAfterLReduce.yql @@ -25,7 +25,7 @@ (return select) ))) (let core (LMap inputRowsList (lambda '(x) (block '( - (let inputType (CallableArgumentType (TypeOf (ScriptUdf 'Python '"f" (CallableType '() '((StreamType (StructType '('"cnt" (DataType 'Uint64)) '('"pass" (DataType 'Int32)) '('"skey" (DataType 'Uint32))))) '((StreamType (StructType '('"cnt" (DataType 'Uint64)) '('"skey" (DataType 'Uint32))))) '((DataType 'Int32))) (String '@@ + (let inputType (CallableArgumentType (TypeOf (ScriptUdf 'Python3 '"f" (CallableType '() '((StreamType (StructType '('"cnt" (DataType 'Uint64)) '('"pass" (DataType 'Int32)) '('"skey" (DataType 'Uint32))))) '((StreamType (StructType '('"cnt" (DataType 'Uint64)) '('"skey" (DataType 'Uint32))))) '((DataType 'Int32))) (String '@@ def f(input,x): for i in list(input): d = i.__dict__ @@ -33,7 +33,7 @@ def f(input,x): yield d @@))) '0)) (let inputRowsList (MatchType inputType 'List (lambda '(y) (ForwardList x)) (lambda '(y) "x"))) - (return (Apply (ScriptUdf 'Python '"f" (CallableType '() '((StreamType (StructType '('"cnt" (DataType 'Uint64)) '('"pass" (DataType 'Int32)) '('"skey" (DataType 'Uint32))))) '((StreamType (StructType '('"cnt" (DataType 'Uint64)) '('"skey" (DataType 'Uint32))))) '((DataType 'Int32))) (String '@@ + (return (Apply (ScriptUdf 'Python3 '"f" (CallableType '() '((StreamType (StructType '('"cnt" (DataType 'Uint64)) '('"pass" (DataType 'Int32)) '('"skey" (DataType 'Uint32))))) '((StreamType (StructType '('"cnt" (DataType 'Uint64)) '('"skey" (DataType 'Uint32))))) '((DataType 'Int32))) (String '@@ def f(input,x): for i in list(input): d = i.__dict__ @@ -46,7 +46,7 @@ def f(input,x): (return select) ))) (let core (LMap inputRowsList (lambda '(x) (block '( - (let inputType (CallableArgumentType (TypeOf (ScriptUdf 'Python '"f" (CallableType '() '((StreamType (StructType '('"cnt" (DataType 'Uint64)) '('"pass" (DataType 'Int32)) '('"skey" (DataType 'Uint32))))) '((StreamType (StructType '('"cnt" (DataType 'Uint64)) '('"pass" (DataType 'Int32)) '('"skey" (DataType 'Uint32))))) '((DataType 'Int32))) (String '@@ + (let inputType (CallableArgumentType (TypeOf (ScriptUdf 'Python3 '"f" (CallableType '() '((StreamType (StructType '('"cnt" (DataType 'Uint64)) '('"pass" (DataType 'Int32)) '('"skey" (DataType 'Uint32))))) '((StreamType (StructType '('"cnt" (DataType 'Uint64)) '('"pass" (DataType 'Int32)) '('"skey" (DataType 'Uint32))))) '((DataType 'Int32))) (String '@@ def f(input,x): for i in list(input): d = i.__dict__ @@ -54,7 +54,7 @@ def f(input,x): yield d @@))) '0)) (let inputRowsList (MatchType inputType 'List (lambda '(y) (ForwardList x)) (lambda '(y) "x"))) - (return (Apply (ScriptUdf 'Python '"f" (CallableType '() '((StreamType (StructType '('"cnt" (DataType 'Uint64)) '('"pass" (DataType 'Int32)) '('"skey" (DataType 'Uint32))))) '((StreamType (StructType '('"cnt" (DataType 'Uint64)) '('"pass" (DataType 'Int32)) '('"skey" (DataType 'Uint32))))) '((DataType 'Int32))) (String '@@ + (return (Apply (ScriptUdf 'Python3 '"f" (CallableType '() '((StreamType (StructType '('"cnt" (DataType 'Uint64)) '('"pass" (DataType 'Int32)) '('"skey" (DataType 'Uint32))))) '((StreamType (StructType '('"cnt" (DataType 'Uint64)) '('"pass" (DataType 'Int32)) '('"skey" (DataType 'Uint32))))) '((DataType 'Int32))) (String '@@ def f(input,x): for i in list(input): d = i.__dict__ diff --git 
a/yql/essentials/tests/s-expressions/suites/Optimizers/FuseLMapAfterReduce.cfg b/yql/essentials/tests/s-expressions/suites/Optimizers/FuseLMapAfterReduce.cfg index 0bbf3ad9f029..62b2a5fe18f1 100644 --- a/yql/essentials/tests/s-expressions/suites/Optimizers/FuseLMapAfterReduce.cfg +++ b/yql/essentials/tests/s-expressions/suites/Optimizers/FuseLMapAfterReduce.cfg @@ -1,4 +1,5 @@ in Input input.txt out Output output.txt res result.txt -udf python2_udf +udf python3_udf +providers dummy diff --git a/yql/essentials/tests/s-expressions/suites/Optimizers/FuseLMapAfterReduce.yql b/yql/essentials/tests/s-expressions/suites/Optimizers/FuseLMapAfterReduce.yql index 1f806dc3a36f..3e8637fc5438 100644 --- a/yql/essentials/tests/s-expressions/suites/Optimizers/FuseLMapAfterReduce.yql +++ b/yql/essentials/tests/s-expressions/suites/Optimizers/FuseLMapAfterReduce.yql @@ -23,7 +23,7 @@ (return select) ))) (let core (LMap inputRowsList (lambda '(x) (block '( - (let inputType (CallableArgumentType (TypeOf (ScriptUdf 'Python '"f" (CallableType '() '((StreamType (StructType '('"cnt" (DataType 'Uint64)) '('"pass" (DataType 'Int32)) '('"skey" (DataType 'Uint32))))) '((StreamType (StructType '('"cnt" (DataType 'Uint64)) '('"skey" (DataType 'Uint32))))) '((DataType 'Int32))) (String '@@ + (let inputType (CallableArgumentType (TypeOf (ScriptUdf 'Python3 '"f" (CallableType '() '((StreamType (StructType '('"cnt" (DataType 'Uint64)) '('"pass" (DataType 'Int32)) '('"skey" (DataType 'Uint32))))) '((StreamType (StructType '('"cnt" (DataType 'Uint64)) '('"skey" (DataType 'Uint32))))) '((DataType 'Int32))) (String '@@ def f(input,x): for i in list(input): d = i.__dict__ @@ -31,7 +31,7 @@ def f(input,x): yield d @@))) '0)) (let inputRowsList (MatchType inputType 'List (lambda '(y) (ForwardList x)) (lambda '(y) "x"))) - (return (Apply (ScriptUdf 'Python '"f" (CallableType '() '((StreamType (StructType '('"cnt" (DataType 'Uint64)) '('"pass" (DataType 'Int32)) '('"skey" (DataType 'Uint32))))) '((StreamType (StructType '('"cnt" (DataType 'Uint64)) '('"skey" (DataType 'Uint32))))) '((DataType 'Int32))) (String '@@ + (return (Apply (ScriptUdf 'Python3 '"f" (CallableType '() '((StreamType (StructType '('"cnt" (DataType 'Uint64)) '('"pass" (DataType 'Int32)) '('"skey" (DataType 'Uint32))))) '((StreamType (StructType '('"cnt" (DataType 'Uint64)) '('"skey" (DataType 'Uint32))))) '((DataType 'Int32))) (String '@@ def f(input,x): for i in list(input): d = i.__dict__ diff --git a/yql/essentials/tests/s-expressions/suites/Optimizers/FuseMapAfterLReduce.cfg b/yql/essentials/tests/s-expressions/suites/Optimizers/FuseMapAfterLReduce.cfg index 0bbf3ad9f029..62b2a5fe18f1 100644 --- a/yql/essentials/tests/s-expressions/suites/Optimizers/FuseMapAfterLReduce.cfg +++ b/yql/essentials/tests/s-expressions/suites/Optimizers/FuseMapAfterLReduce.cfg @@ -1,4 +1,5 @@ in Input input.txt out Output output.txt res result.txt -udf python2_udf +udf python3_udf +providers dummy diff --git a/yql/essentials/tests/s-expressions/suites/Optimizers/FuseMapAfterLReduce.yql b/yql/essentials/tests/s-expressions/suites/Optimizers/FuseMapAfterLReduce.yql index ea138e481ade..5656d60fc3a1 100644 --- a/yql/essentials/tests/s-expressions/suites/Optimizers/FuseMapAfterLReduce.yql +++ b/yql/essentials/tests/s-expressions/suites/Optimizers/FuseMapAfterLReduce.yql @@ -25,24 +25,24 @@ (return select) ))) (let core (LMap inputRowsList (lambda '(x) (block '( - (let inputType (CallableArgumentType (TypeOf (ScriptUdf 'Python '"f" (CallableType '() '((StreamType (StructType '('"cnt" 
(DataType 'Uint64)) '('"pass" (DataType 'Int32)) '('"skey" (DataType 'Uint32))))) '((StreamType (StructType '('"cnt" (DataType 'Uint64)) '('"skey" (DataType 'Uint32))))) '((DataType 'Int32))) (String '@@ + (let inputType (CallableArgumentType (TypeOf (ScriptUdf 'Python3 '"f" (CallableType '() '((StreamType (StructType '('"cnt" (DataType 'Uint64)) '('"pass" (DataType 'Int32)) '('"skey" (DataType 'Uint32))))) '((StreamType (StructType '('"cnt" (DataType 'Uint64)) '('"skey" (DataType 'Uint32))))) '((DataType 'Int32))) (String '@@ def f(input,x): for i in list(input): d = i.__dict__ d["pass"] = x yield d - + def f2(a,b,c): return {"a":a,"b":b,"c":c} @@))) '0)) (let inputRowsList (MatchType inputType 'List (lambda '(y) (ForwardList x)) (lambda '(y) "x"))) - (return (Apply (ScriptUdf 'Python '"f" (CallableType '() '((StreamType (StructType '('"cnt" (DataType 'Uint64)) '('"pass" (DataType 'Int32)) '('"skey" (DataType 'Uint32))))) '((StreamType (StructType '('"cnt" (DataType 'Uint64)) '('"skey" (DataType 'Uint32))))) '((DataType 'Int32))) (String '@@ + (return (Apply (ScriptUdf 'Python3 '"f" (CallableType '() '((StreamType (StructType '('"cnt" (DataType 'Uint64)) '('"pass" (DataType 'Int32)) '('"skey" (DataType 'Uint32))))) '((StreamType (StructType '('"cnt" (DataType 'Uint64)) '('"skey" (DataType 'Uint32))))) '((DataType 'Int32))) (String '@@ def f(input,x): for i in list(input): d = i.__dict__ d["pass"] = x yield d - + def f2(a,b,c): return {"a":a,"b":b,"c":c} @@)) inputRowsList (Int32 '"1"))) @@ -52,13 +52,13 @@ def f2(a,b,c): (return select) ))) (let core (FlatMap core (lambda '(row) (block '( - (let res (Apply (ScriptUdf 'Python '"f2" (CallableType '() '((StructType '('"a" (DataType 'Uint32)) '('"b" (DataType 'Uint64)) '('"c" (DataType 'Int32)))) '((DataType 'Uint32) 'skey) '((DataType 'Uint64) 'cnt) '((DataType 'Int32) 'pass)) (String '@@ + (let res (Apply (ScriptUdf 'Python3 '"f2" (CallableType '() '((StructType '('"a" (DataType 'Uint32)) '('"b" (DataType 'Uint64)) '('"c" (DataType 'Int32)))) '((DataType 'Uint32) 'skey) '((DataType 'Uint64) 'cnt) '((DataType 'Int32) 'pass)) (String '@@ def f(input,x): for i in list(input): d = i.__dict__ d["pass"] = x yield d - + def f2(a,b,c): return {"a":a,"b":b,"c":c} @@)) (Member row '"skey") (Member row '"cnt") (Member row '"pass"))) diff --git a/yql/essentials/tests/s-expressions/suites/Optimizers/PartitionByWithInnerSort.cfg b/yql/essentials/tests/s-expressions/suites/Optimizers/PartitionByWithInnerSort.cfg index bc0dfe8e1843..f3abc496dd82 100644 --- a/yql/essentials/tests/s-expressions/suites/Optimizers/PartitionByWithInnerSort.cfg +++ b/yql/essentials/tests/s-expressions/suites/Optimizers/PartitionByWithInnerSort.cfg @@ -1,4 +1,4 @@ in Input SomeKeys.txt out Output output.txt res result.txt -udf python2_udf +udf python3_udf diff --git a/yql/essentials/tests/s-expressions/suites/Optimizers/PartitionByWithInnerSort.yql b/yql/essentials/tests/s-expressions/suites/Optimizers/PartitionByWithInnerSort.yql index a245de4f1825..7b776d078bc2 100644 --- a/yql/essentials/tests/s-expressions/suites/Optimizers/PartitionByWithInnerSort.yql +++ b/yql/essentials/tests/s-expressions/suites/Optimizers/PartitionByWithInnerSort.yql @@ -8,7 +8,7 @@ def MyTuple(one, two): return (one, two) @@)) - (let udf (ScriptUdf 'Python 'MyTuple udfType udfScript)) + (let udf (ScriptUdf 'Python3 'MyTuple udfType udfScript)) (let mr_source (DataSource 'yt 'plato)) (let x (Read! 
world mr_source (Key '('table (String 'Input))) (Void) '())) diff --git a/yql/essentials/tests/s-expressions/suites/Optimizers/PartitionsByWithInnerSort.cfg b/yql/essentials/tests/s-expressions/suites/Optimizers/PartitionsByWithInnerSort.cfg index bc0dfe8e1843..f3abc496dd82 100644 --- a/yql/essentials/tests/s-expressions/suites/Optimizers/PartitionsByWithInnerSort.cfg +++ b/yql/essentials/tests/s-expressions/suites/Optimizers/PartitionsByWithInnerSort.cfg @@ -1,4 +1,4 @@ in Input SomeKeys.txt out Output output.txt res result.txt -udf python2_udf +udf python3_udf diff --git a/yql/essentials/tests/s-expressions/suites/Udf/AutoMapMany.cfg b/yql/essentials/tests/s-expressions/suites/Udf/AutoMapMany.cfg index 0bbf3ad9f029..aa782f323cb1 100644 --- a/yql/essentials/tests/s-expressions/suites/Udf/AutoMapMany.cfg +++ b/yql/essentials/tests/s-expressions/suites/Udf/AutoMapMany.cfg @@ -1,4 +1,4 @@ in Input input.txt out Output output.txt res result.txt -udf python2_udf +udf python3_udf diff --git a/yql/essentials/tests/s-expressions/suites/Udf/AutoMapMany.yql b/yql/essentials/tests/s-expressions/suites/Udf/AutoMapMany.yql index e179419a5c86..6e29b890b984 100644 --- a/yql/essentials/tests/s-expressions/suites/Udf/AutoMapMany.yql +++ b/yql/essentials/tests/s-expressions/suites/Udf/AutoMapMany.yql @@ -42,7 +42,7 @@ def f(x,y,z): (let select (block '( (let core (AsList (Uint32 '0))) (let core (FlatMap core (lambda '(row) (block '( - (let res (AsStruct '('"column0" ("Apply" (ScriptUdf 'Python '"f" (CallableType '() '((DataType 'Int32)) '((DataType 'Int32) '"" '1) '((DataType 'Int32) '"" '1) '((DataType 'Int32) '"" '1)) scriptudf0) (Int32 '"1") (Int32 '"2") (Int32 '"3"))))) + (let res (AsStruct '('"column0" ("Apply" (ScriptUdf 'Python3 '"f" (CallableType '() '((DataType 'Int32)) '((DataType 'Int32) '"" '1) '((DataType 'Int32) '"" '1) '((DataType 'Int32) '"" '1)) scriptudf0) (Int32 '"1") (Int32 '"2") (Int32 '"3"))))) (let res (AsList res)) (return res) ))))) @@ -62,7 +62,7 @@ def f(x,y,z): (let select (block '( (let core (AsList (Uint32 '0))) (let core (FlatMap core (lambda '(row) (block '( - (let res (AsStruct '('"column0" ("Apply" (ScriptUdf 'Python '"f" (CallableType '() '((DataType 'Int32)) '((DataType 'Int32) '"" '1) '((DataType 'Int32) '"" '1) '((DataType 'Int32) '"" '1)) scriptudf1) ("Just" (Int32 '"1")) (Int32 '"2") (Int32 '"3"))))) + (let res (AsStruct '('"column0" ("Apply" (ScriptUdf 'Python3 '"f" (CallableType '() '((DataType 'Int32)) '((DataType 'Int32) '"" '1) '((DataType 'Int32) '"" '1) '((DataType 'Int32) '"" '1)) scriptudf1) ("Just" (Int32 '"1")) (Int32 '"2") (Int32 '"3"))))) (let res (AsList res)) (return res) ))))) @@ -82,7 +82,7 @@ def f(x,y,z): (let select (block '( (let core (AsList (Uint32 '0))) (let core (FlatMap core (lambda '(row) (block '( - (let res (AsStruct '('"column0" ("Apply" (ScriptUdf 'Python '"f" (CallableType '() '((DataType 'Int32)) '((DataType 'Int32) '"" '1) '((DataType 'Int32) '"" '1) '((DataType 'Int32) '"" '1)) scriptudf2) ("Just" (Int32 '"1")) ("Just" (Int32 '"2")) (Int32 '"3"))))) + (let res (AsStruct '('"column0" ("Apply" (ScriptUdf 'Python3 '"f" (CallableType '() '((DataType 'Int32)) '((DataType 'Int32) '"" '1) '((DataType 'Int32) '"" '1) '((DataType 'Int32) '"" '1)) scriptudf2) ("Just" (Int32 '"1")) ("Just" (Int32 '"2")) (Int32 '"3"))))) (let res (AsList res)) (return res) ))))) @@ -102,7 +102,7 @@ def f(x,y,z): (let select (block '( (let core (AsList (Uint32 '0))) (let core (FlatMap core (lambda '(row) (block '( - (let res (AsStruct '('"column0" ("Apply" 
(ScriptUdf 'Python '"f" (CallableType '() '((DataType 'Int32)) '((DataType 'Int32) '"" '1) '((DataType 'Int32) '"" '1) '((DataType 'Int32) '"" '1)) scriptudf3) ("Just" (Int32 '"1")) ("Just" (Int32 '"2")) ("Just" (Int32 '"3")))))) + (let res (AsStruct '('"column0" ("Apply" (ScriptUdf 'Python3 '"f" (CallableType '() '((DataType 'Int32)) '((DataType 'Int32) '"" '1) '((DataType 'Int32) '"" '1) '((DataType 'Int32) '"" '1)) scriptudf3) ("Just" (Int32 '"1")) ("Just" (Int32 '"2")) ("Just" (Int32 '"3")))))) (let res (AsList res)) (return res) ))))) @@ -122,7 +122,7 @@ def f(x,y,z): (let select (block '( (let core (AsList (Uint32 '0))) (let core (FlatMap core (lambda '(row) (block '( - (let res (AsStruct '('"column0" ("Apply" (ScriptUdf 'Python '"f" (CallableType '() '((OptionalType (DataType 'Int32))) '((DataType 'Int32) '"" '1) '((DataType 'Int32) '"" '1) '((DataType 'Int32) '"" '1)) scriptudf4) (Int32 '"4") (Int32 '"5") (Int32 '"6"))))) + (let res (AsStruct '('"column0" ("Apply" (ScriptUdf 'Python3 '"f" (CallableType '() '((OptionalType (DataType 'Int32))) '((DataType 'Int32) '"" '1) '((DataType 'Int32) '"" '1) '((DataType 'Int32) '"" '1)) scriptudf4) (Int32 '"4") (Int32 '"5") (Int32 '"6"))))) (let res (AsList res)) (return res) ))))) @@ -142,7 +142,7 @@ def f(x,y,z): (let select (block '( (let core (AsList (Uint32 '0))) (let core (FlatMap core (lambda '(row) (block '( - (let res (AsStruct '('"column0" ("Apply" (ScriptUdf 'Python '"f" (CallableType '() '((OptionalType (DataType 'Int32))) '((DataType 'Int32) '"" '1) '((DataType 'Int32) '"" '1) '((DataType 'Int32) '"" '1)) scriptudf5) ("Just" (Int32 '"4")) (Int32 '"5") (Int32 '"6"))))) + (let res (AsStruct '('"column0" ("Apply" (ScriptUdf 'Python3 '"f" (CallableType '() '((OptionalType (DataType 'Int32))) '((DataType 'Int32) '"" '1) '((DataType 'Int32) '"" '1) '((DataType 'Int32) '"" '1)) scriptudf5) ("Just" (Int32 '"4")) (Int32 '"5") (Int32 '"6"))))) (let res (AsList res)) (return res) ))))) @@ -162,7 +162,7 @@ def f(x,y,z): (let select (block '( (let core (AsList (Uint32 '0))) (let core (FlatMap core (lambda '(row) (block '( - (let res (AsStruct '('"column0" ("Apply" (ScriptUdf 'Python '"f" (CallableType '() '((OptionalType (DataType 'Int32))) '((DataType 'Int32) '"" '1) '((DataType 'Int32) '"" '1) '((DataType 'Int32) '"" '1)) scriptudf6) ("Just" (Int32 '"4")) ("Just" (Int32 '"5")) (Int32 '"6"))))) + (let res (AsStruct '('"column0" ("Apply" (ScriptUdf 'Python3 '"f" (CallableType '() '((OptionalType (DataType 'Int32))) '((DataType 'Int32) '"" '1) '((DataType 'Int32) '"" '1) '((DataType 'Int32) '"" '1)) scriptudf6) ("Just" (Int32 '"4")) ("Just" (Int32 '"5")) (Int32 '"6"))))) (let res (AsList res)) (return res) ))))) @@ -182,7 +182,7 @@ def f(x,y,z): (let select (block '( (let core (AsList (Uint32 '0))) (let core (FlatMap core (lambda '(row) (block '( - (let res (AsStruct '('"column0" ("Apply" (ScriptUdf 'Python '"f" (CallableType '() '((OptionalType (DataType 'Int32))) '((DataType 'Int32) '"" '1) '((DataType 'Int32) '"" '1) '((DataType 'Int32) '"" '1)) scriptudf7) ("Just" (Int32 '"4")) ("Just" (Int32 '"5")) ("Just" (Int32 '"6")))))) + (let res (AsStruct '('"column0" ("Apply" (ScriptUdf 'Python3 '"f" (CallableType '() '((OptionalType (DataType 'Int32))) '((DataType 'Int32) '"" '1) '((DataType 'Int32) '"" '1) '((DataType 'Int32) '"" '1)) scriptudf7) ("Just" (Int32 '"4")) ("Just" (Int32 '"5")) ("Just" (Int32 '"6")))))) (let res (AsList res)) (return res) ))))) diff --git a/yql/essentials/tests/s-expressions/suites/Udf/AutoMapManyNamed.cfg 
b/yql/essentials/tests/s-expressions/suites/Udf/AutoMapManyNamed.cfg index 0bbf3ad9f029..aa782f323cb1 100644 --- a/yql/essentials/tests/s-expressions/suites/Udf/AutoMapManyNamed.cfg +++ b/yql/essentials/tests/s-expressions/suites/Udf/AutoMapManyNamed.cfg @@ -1,4 +1,4 @@ in Input input.txt out Output output.txt res result.txt -udf python2_udf +udf python3_udf diff --git a/yql/essentials/tests/s-expressions/suites/Udf/AutoMapManyNamed.yql b/yql/essentials/tests/s-expressions/suites/Udf/AutoMapManyNamed.yql index 2225ad9b7e1b..64c30fb14288 100644 --- a/yql/essentials/tests/s-expressions/suites/Udf/AutoMapManyNamed.yql +++ b/yql/essentials/tests/s-expressions/suites/Udf/AutoMapManyNamed.yql @@ -58,7 +58,7 @@ def f(x,y,z): (let select (block '( (let core (AsList (Uint32 '0))) (let core (FlatMap core (lambda '(row) (block '( - (let res (AsStruct '('"column0" ("NamedApply" (ScriptUdf 'Python '"f" (CallableType '() '((DataType 'Int32)) '((DataType 'Int32) 'x '1) '((DataType 'Int32) 'y '1) '((DataType 'Int32) 'z '1)) scriptudf0) '() (AsStruct '('x (Int32 '"1")) '('y (Int32 '"2")) '('z (Int32 '"3"))))))) + (let res (AsStruct '('"column0" ("NamedApply" (ScriptUdf 'Python3 '"f" (CallableType '() '((DataType 'Int32)) '((DataType 'Int32) 'x '1) '((DataType 'Int32) 'y '1) '((DataType 'Int32) 'z '1)) scriptudf0) '() (AsStruct '('x (Int32 '"1")) '('y (Int32 '"2")) '('z (Int32 '"3"))))))) (let res (AsList res)) (return res) ))))) @@ -78,7 +78,7 @@ def f(x,y,z): (let select (block '( (let core (AsList (Uint32 '0))) (let core (FlatMap core (lambda '(row) (block '( - (let res (AsStruct '('"column0" ("NamedApply" (ScriptUdf 'Python '"f" (CallableType '() '((DataType 'Int32)) '((DataType 'Int32) 'x '1) '((DataType 'Int32) 'y '1) '((DataType 'Int32) 'z '1)) scriptudf1) '() (AsStruct '('x ("Just" (Int32 '"1"))) '('y (Int32 '"2")) '('z (Int32 '"3"))))))) + (let res (AsStruct '('"column0" ("NamedApply" (ScriptUdf 'Python3 '"f" (CallableType '() '((DataType 'Int32)) '((DataType 'Int32) 'x '1) '((DataType 'Int32) 'y '1) '((DataType 'Int32) 'z '1)) scriptudf1) '() (AsStruct '('x ("Just" (Int32 '"1"))) '('y (Int32 '"2")) '('z (Int32 '"3"))))))) (let res (AsList res)) (return res) ))))) @@ -98,7 +98,7 @@ def f(x,y,z): (let select (block '( (let core (AsList (Uint32 '0))) (let core (FlatMap core (lambda '(row) (block '( - (let res (AsStruct '('"column0" ("NamedApply" (ScriptUdf 'Python '"f" (CallableType '() '((DataType 'Int32)) '((DataType 'Int32) 'x '1) '((DataType 'Int32) 'y '1) '((DataType 'Int32) 'z '1)) scriptudf2) '() (AsStruct '('x ("Just" (Int32 '"1"))) '('y ("Just" (Int32 '"2"))) '('z (Int32 '"3"))))))) + (let res (AsStruct '('"column0" ("NamedApply" (ScriptUdf 'Python3 '"f" (CallableType '() '((DataType 'Int32)) '((DataType 'Int32) 'x '1) '((DataType 'Int32) 'y '1) '((DataType 'Int32) 'z '1)) scriptudf2) '() (AsStruct '('x ("Just" (Int32 '"1"))) '('y ("Just" (Int32 '"2"))) '('z (Int32 '"3"))))))) (let res (AsList res)) (return res) ))))) @@ -118,7 +118,7 @@ def f(x,y,z): (let select (block '( (let core (AsList (Uint32 '0))) (let core (FlatMap core (lambda '(row) (block '( - (let res (AsStruct '('"column0" ("NamedApply" (ScriptUdf 'Python '"f" (CallableType '() '((DataType 'Int32)) '((DataType 'Int32) 'x '1) '((DataType 'Int32) 'y '1) '((DataType 'Int32) 'z '1)) scriptudf3) '() (AsStruct '('x ("Just" (Int32 '"1"))) '('y ("Just" (Int32 '"2"))) '('z ("Just" (Int32 '"3")))))))) + (let res (AsStruct '('"column0" ("NamedApply" (ScriptUdf 'Python3 '"f" (CallableType '() '((DataType 'Int32)) '((DataType 'Int32) 'x '1) 
'((DataType 'Int32) 'y '1) '((DataType 'Int32) 'z '1)) scriptudf3) '() (AsStruct '('x ("Just" (Int32 '"1"))) '('y ("Just" (Int32 '"2"))) '('z ("Just" (Int32 '"3")))))))) (let res (AsList res)) (return res) ))))) @@ -138,7 +138,7 @@ def f(x,y,z): (let select (block '( (let core (AsList (Uint32 '0))) (let core (FlatMap core (lambda '(row) (block '( - (let res (AsStruct '('"column0" ("NamedApply" (ScriptUdf 'Python '"f" (CallableType '() '((DataType 'Int32)) '((DataType 'Int32) 'x '1) '((DataType 'Int32) 'y '1) '((DataType 'Int32) 'z '1)) scriptudf4) '(("Just" (Int32 '"1"))) (AsStruct '('y ("Just" (Int32 '"2"))) '('z ("Just" (Int32 '"3")))))))) + (let res (AsStruct '('"column0" ("NamedApply" (ScriptUdf 'Python3 '"f" (CallableType '() '((DataType 'Int32)) '((DataType 'Int32) 'x '1) '((DataType 'Int32) 'y '1) '((DataType 'Int32) 'z '1)) scriptudf4) '(("Just" (Int32 '"1"))) (AsStruct '('y ("Just" (Int32 '"2"))) '('z ("Just" (Int32 '"3")))))))) (let res (AsList res)) (return res) ))))) @@ -158,7 +158,7 @@ def f(x,y,z): (let select (block '( (let core (AsList (Uint32 '0))) (let core (FlatMap core (lambda '(row) (block '( - (let res (AsStruct '('"column0" ("NamedApply" (ScriptUdf 'Python '"f" (CallableType '() '((DataType 'Int32)) '((DataType 'Int32) 'x '1) '((DataType 'Int32) 'y '1) '((DataType 'Int32) 'z '1)) scriptudf5) '(("Just" (Int32 '"1")) ("Just" (Int32 '"2"))) (AsStruct '('z ("Just" (Int32 '"3")))))))) + (let res (AsStruct '('"column0" ("NamedApply" (ScriptUdf 'Python3 '"f" (CallableType '() '((DataType 'Int32)) '((DataType 'Int32) 'x '1) '((DataType 'Int32) 'y '1) '((DataType 'Int32) 'z '1)) scriptudf5) '(("Just" (Int32 '"1")) ("Just" (Int32 '"2"))) (AsStruct '('z ("Just" (Int32 '"3")))))))) (let res (AsList res)) (return res) ))))) @@ -178,7 +178,7 @@ def f(x,y,z): (let select (block '( (let core (AsList (Uint32 '0))) (let core (FlatMap core (lambda '(row) (block '( - (let res (AsStruct '('"column0" ("NamedApply" (ScriptUdf 'Python '"f" (CallableType '() '((OptionalType (DataType 'Int32))) '((DataType 'Int32) 'x '1) '((DataType 'Int32) 'y '1) '((DataType 'Int32) 'z '1)) scriptudf6) '() (AsStruct '('x (Int32 '"4")) '('y (Int32 '"5")) '('z (Int32 '"6"))))))) + (let res (AsStruct '('"column0" ("NamedApply" (ScriptUdf 'Python3 '"f" (CallableType '() '((OptionalType (DataType 'Int32))) '((DataType 'Int32) 'x '1) '((DataType 'Int32) 'y '1) '((DataType 'Int32) 'z '1)) scriptudf6) '() (AsStruct '('x (Int32 '"4")) '('y (Int32 '"5")) '('z (Int32 '"6"))))))) (let res (AsList res)) (return res) ))))) @@ -198,7 +198,7 @@ def f(x,y,z): (let select (block '( (let core (AsList (Uint32 '0))) (let core (FlatMap core (lambda '(row) (block '( - (let res (AsStruct '('"column0" ("NamedApply" (ScriptUdf 'Python '"f" (CallableType '() '((OptionalType (DataType 'Int32))) '((DataType 'Int32) 'x '1) '((DataType 'Int32) 'y '1) '((DataType 'Int32) 'z '1)) scriptudf7) '() (AsStruct '('x ("Just" (Int32 '"4"))) '('y (Int32 '"5")) '('z (Int32 '"6"))))))) + (let res (AsStruct '('"column0" ("NamedApply" (ScriptUdf 'Python3 '"f" (CallableType '() '((OptionalType (DataType 'Int32))) '((DataType 'Int32) 'x '1) '((DataType 'Int32) 'y '1) '((DataType 'Int32) 'z '1)) scriptudf7) '() (AsStruct '('x ("Just" (Int32 '"4"))) '('y (Int32 '"5")) '('z (Int32 '"6"))))))) (let res (AsList res)) (return res) ))))) @@ -218,7 +218,7 @@ def f(x,y,z): (let select (block '( (let core (AsList (Uint32 '0))) (let core (FlatMap core (lambda '(row) (block '( - (let res (AsStruct '('"column0" ("NamedApply" (ScriptUdf 'Python '"f" (CallableType '() 
'((OptionalType (DataType 'Int32))) '((DataType 'Int32) 'x '1) '((DataType 'Int32) 'y '1) '((DataType 'Int32) 'z '1)) scriptudf8) '() (AsStruct '('x ("Just" (Int32 '"4"))) '('y ("Just" (Int32 '"5"))) '('z (Int32 '"6"))))))) + (let res (AsStruct '('"column0" ("NamedApply" (ScriptUdf 'Python3 '"f" (CallableType '() '((OptionalType (DataType 'Int32))) '((DataType 'Int32) 'x '1) '((DataType 'Int32) 'y '1) '((DataType 'Int32) 'z '1)) scriptudf8) '() (AsStruct '('x ("Just" (Int32 '"4"))) '('y ("Just" (Int32 '"5"))) '('z (Int32 '"6"))))))) (let res (AsList res)) (return res) ))))) @@ -238,7 +238,7 @@ def f(x,y,z): (let select (block '( (let core (AsList (Uint32 '0))) (let core (FlatMap core (lambda '(row) (block '( - (let res (AsStruct '('"column0" ("NamedApply" (ScriptUdf 'Python '"f" (CallableType '() '((OptionalType (DataType 'Int32))) '((DataType 'Int32) 'x '1) '((DataType 'Int32) 'y '1) '((DataType 'Int32) 'z '1)) scriptudf9) '() (AsStruct '('x ("Just" (Int32 '"4"))) '('y ("Just" (Int32 '"5"))) '('z ("Just" (Int32 '"6")))))))) + (let res (AsStruct '('"column0" ("NamedApply" (ScriptUdf 'Python3 '"f" (CallableType '() '((OptionalType (DataType 'Int32))) '((DataType 'Int32) 'x '1) '((DataType 'Int32) 'y '1) '((DataType 'Int32) 'z '1)) scriptudf9) '() (AsStruct '('x ("Just" (Int32 '"4"))) '('y ("Just" (Int32 '"5"))) '('z ("Just" (Int32 '"6")))))))) (let res (AsList res)) (return res) ))))) @@ -258,7 +258,7 @@ def f(x,y,z): (let select (block '( (let core (AsList (Uint32 '0))) (let core (FlatMap core (lambda '(row) (block '( - (let res (AsStruct '('"column0" ("NamedApply" (ScriptUdf 'Python '"f" (CallableType '() '((OptionalType (DataType 'Int32))) '((DataType 'Int32) 'x '1) '((DataType 'Int32) 'y '1) '((DataType 'Int32) 'z '1)) scriptudf10) '(("Just" (Int32 '"4"))) (AsStruct '('y ("Just" (Int32 '"5"))) '('z ("Just" (Int32 '"6")))))))) + (let res (AsStruct '('"column0" ("NamedApply" (ScriptUdf 'Python3 '"f" (CallableType '() '((OptionalType (DataType 'Int32))) '((DataType 'Int32) 'x '1) '((DataType 'Int32) 'y '1) '((DataType 'Int32) 'z '1)) scriptudf10) '(("Just" (Int32 '"4"))) (AsStruct '('y ("Just" (Int32 '"5"))) '('z ("Just" (Int32 '"6")))))))) (let res (AsList res)) (return res) ))))) @@ -278,7 +278,7 @@ def f(x,y,z): (let select (block '( (let core (AsList (Uint32 '0))) (let core (FlatMap core (lambda '(row) (block '( - (let res (AsStruct '('"column0" ("NamedApply" (ScriptUdf 'Python '"f" (CallableType '() '((OptionalType (DataType 'Int32))) '((DataType 'Int32) 'x '1) '((DataType 'Int32) 'y '1) '((DataType 'Int32) 'z '1)) scriptudf11) '(("Just" (Int32 '"4")) ("Just" (Int32 '"5"))) (AsStruct '('z ("Just" (Int32 '"6")))))))) + (let res (AsStruct '('"column0" ("NamedApply" (ScriptUdf 'Python3 '"f" (CallableType '() '((OptionalType (DataType 'Int32))) '((DataType 'Int32) 'x '1) '((DataType 'Int32) 'y '1) '((DataType 'Int32) 'z '1)) scriptudf11) '(("Just" (Int32 '"4")) ("Just" (Int32 '"5"))) (AsStruct '('z ("Just" (Int32 '"6")))))))) (let res (AsList res)) (return res) ))))) diff --git a/yql/essentials/tests/s-expressions/suites/Udf/PythonAvg.cfg b/yql/essentials/tests/s-expressions/suites/Udf/PythonAvg.cfg index 0bbf3ad9f029..aa782f323cb1 100644 --- a/yql/essentials/tests/s-expressions/suites/Udf/PythonAvg.cfg +++ b/yql/essentials/tests/s-expressions/suites/Udf/PythonAvg.cfg @@ -1,4 +1,4 @@ in Input input.txt out Output output.txt res result.txt -udf python2_udf +udf python3_udf diff --git a/yql/essentials/tests/s-expressions/suites/Udf/PythonAvg.yql 
b/yql/essentials/tests/s-expressions/suites/Udf/PythonAvg.yql index 62dc07db0543..9029469690fc 100644 --- a/yql/essentials/tests/s-expressions/suites/Udf/PythonAvg.yql +++ b/yql/essentials/tests/s-expressions/suites/Udf/PythonAvg.yql @@ -32,9 +32,9 @@ def avg_result(avg): @@)) -(let avgCreate (ScriptUdf 'Python 'avg_create (CallableType '() '(rt)) udfScript)) -(let avgAdd (ScriptUdf 'Python 'avg_add (CallableType '() '(rt) '(rt) '(ui32)) udfScript)) -(let avgResult (ScriptUdf 'Python 'avg_result (CallableType '() '(dbl) '(rt)) udfScript)) +(let avgCreate (ScriptUdf 'Python3 'avg_create (CallableType '() '(rt)) udfScript)) +(let avgAdd (ScriptUdf 'Python3 'avg_add (CallableType '() '(rt) '(rt) '(ui32)) udfScript)) +(let avgResult (ScriptUdf 'Python3 'avg_result (CallableType '() '(dbl) '(rt)) udfScript)) # data (let x (Uint32 '10)) diff --git a/yql/essentials/tests/s-expressions/suites/Udf/PythonCallableAsArg.cfg b/yql/essentials/tests/s-expressions/suites/Udf/PythonCallableAsArg.cfg index 0bffbd4f08e1..d24419e54eb0 100644 --- a/yql/essentials/tests/s-expressions/suites/Udf/PythonCallableAsArg.cfg +++ b/yql/essentials/tests/s-expressions/suites/Udf/PythonCallableAsArg.cfg @@ -1,5 +1,5 @@ in Input input.txt out Output output.txt res result.txt -udf python2_udf +udf python3_udf udf simple_udf diff --git a/yql/essentials/tests/s-expressions/suites/Udf/PythonCallableAsArg.yql b/yql/essentials/tests/s-expressions/suites/Udf/PythonCallableAsArg.yql index 36a70677e4fd..29a7d26a0a7e 100644 --- a/yql/essentials/tests/s-expressions/suites/Udf/PythonCallableAsArg.yql +++ b/yql/essentials/tests/s-expressions/suites/Udf/PythonCallableAsArg.yql @@ -14,9 +14,9 @@ (let udfType (CallableType '() '(str) '(funcType) '(ui64))) (let udfScript (String '@@ def new_string(func, x): - return func('x', x) + ':' + func('y', x) + return func(b'x', x) + b':' + func(b'y', x) @@)) -(let udf (ScriptUdf 'Python 'new_string udfType udfScript)) +(let udf (ScriptUdf 'Python3 'new_string udfType udfScript)) # call udf (let x (Uint64 '3)) diff --git a/yql/essentials/tests/s-expressions/suites/Udf/PythonCallableAsResult.cfg b/yql/essentials/tests/s-expressions/suites/Udf/PythonCallableAsResult.cfg index 0bbf3ad9f029..aa782f323cb1 100644 --- a/yql/essentials/tests/s-expressions/suites/Udf/PythonCallableAsResult.cfg +++ b/yql/essentials/tests/s-expressions/suites/Udf/PythonCallableAsResult.cfg @@ -1,4 +1,4 @@ in Input input.txt out Output output.txt res result.txt -udf python2_udf +udf python3_udf diff --git a/yql/essentials/tests/s-expressions/suites/Udf/PythonCallableAsResult.yql b/yql/essentials/tests/s-expressions/suites/Udf/PythonCallableAsResult.yql index 7a7a6a995a2f..887526632b82 100644 --- a/yql/essentials/tests/s-expressions/suites/Udf/PythonCallableAsResult.yql +++ b/yql/essentials/tests/s-expressions/suites/Udf/PythonCallableAsResult.yql @@ -19,7 +19,7 @@ def create_counter(start): return counter @@)) -(let udf (ScriptUdf 'Python 'create_counter udfType udfScript)) +(let udf (ScriptUdf 'Python3 'create_counter udfType udfScript)) # call udf (let counter (Apply udf (Uint64 '1))) diff --git a/yql/essentials/tests/s-expressions/suites/Udf/PythonGenerator.cfg b/yql/essentials/tests/s-expressions/suites/Udf/PythonGenerator.cfg index 0bbf3ad9f029..aa782f323cb1 100644 --- a/yql/essentials/tests/s-expressions/suites/Udf/PythonGenerator.cfg +++ b/yql/essentials/tests/s-expressions/suites/Udf/PythonGenerator.cfg @@ -1,4 +1,4 @@ in Input input.txt out Output output.txt res result.txt -udf python2_udf +udf python3_udf diff --git 
a/yql/essentials/tests/s-expressions/suites/Udf/PythonGenerator.yql b/yql/essentials/tests/s-expressions/suites/Udf/PythonGenerator.yql index d225a14c1bbd..b6e6a001716e 100644 --- a/yql/essentials/tests/s-expressions/suites/Udf/PythonGenerator.yql +++ b/yql/essentials/tests/s-expressions/suites/Udf/PythonGenerator.yql @@ -8,11 +8,11 @@ (let udfScript (String '@@ def g0(): def gen(): - for x in xrange(1, 4): - yield long(x + 42) + for x in range(1, 4): + yield int(x + 42) return gen @@)) -(let udf (ScriptUdf 'Python 'g0 udfType udfScript)) +(let udf (ScriptUdf 'Python3 'g0 udfType udfScript)) # call udf (let result (Apply udf)) diff --git a/yql/essentials/tests/s-expressions/suites/Udf/PythonGeneratorExprWithClosure.cfg b/yql/essentials/tests/s-expressions/suites/Udf/PythonGeneratorExprWithClosure.cfg index 0bbf3ad9f029..aa782f323cb1 100644 --- a/yql/essentials/tests/s-expressions/suites/Udf/PythonGeneratorExprWithClosure.cfg +++ b/yql/essentials/tests/s-expressions/suites/Udf/PythonGeneratorExprWithClosure.cfg @@ -1,4 +1,4 @@ in Input input.txt out Output output.txt res result.txt -udf python2_udf +udf python3_udf diff --git a/yql/essentials/tests/s-expressions/suites/Udf/PythonGeneratorExprWithClosure.yql b/yql/essentials/tests/s-expressions/suites/Udf/PythonGeneratorExprWithClosure.yql index 86a1c9a0bb18..164f870ec8b1 100644 --- a/yql/essentials/tests/s-expressions/suites/Udf/PythonGeneratorExprWithClosure.yql +++ b/yql/essentials/tests/s-expressions/suites/Udf/PythonGeneratorExprWithClosure.yql @@ -8,10 +8,10 @@ (let udfScript (String '@@ def g(input): def gen(): - return (long(x + 42) for x in input) + return (int(x + 42) for x in input) return gen @@)) -(let udf (ScriptUdf 'Python 'g udfType udfScript)) +(let udf (ScriptUdf 'Python3 'g udfType udfScript)) # call udf (let result (Apply udf (AsList (Int64 '1) (Int64 '2) (Int64 '3)))) diff --git a/yql/essentials/tests/s-expressions/suites/Udf/PythonGeneratorWithClosure.cfg b/yql/essentials/tests/s-expressions/suites/Udf/PythonGeneratorWithClosure.cfg index 0bbf3ad9f029..aa782f323cb1 100644 --- a/yql/essentials/tests/s-expressions/suites/Udf/PythonGeneratorWithClosure.cfg +++ b/yql/essentials/tests/s-expressions/suites/Udf/PythonGeneratorWithClosure.cfg @@ -1,4 +1,4 @@ in Input input.txt out Output output.txt res result.txt -udf python2_udf +udf python3_udf diff --git a/yql/essentials/tests/s-expressions/suites/Udf/PythonGeneratorWithClosure.yql b/yql/essentials/tests/s-expressions/suites/Udf/PythonGeneratorWithClosure.yql index 87799cbf69dc..2c9a022caec3 100644 --- a/yql/essentials/tests/s-expressions/suites/Udf/PythonGeneratorWithClosure.yql +++ b/yql/essentials/tests/s-expressions/suites/Udf/PythonGeneratorWithClosure.yql @@ -9,10 +9,10 @@ def h(input): def gen(): for x in input: - yield long(x + 42) + yield int(x + 42) return gen @@)) -(let udf (ScriptUdf 'Python 'h udfType udfScript)) +(let udf (ScriptUdf 'Python3 'h udfType udfScript)) # call udf (let result (Apply udf (AsList (Int64 '1) (Int64 '2) (Int64 '3)))) diff --git a/yql/essentials/tests/s-expressions/suites/Udf/PythonPartialCall.cfg b/yql/essentials/tests/s-expressions/suites/Udf/PythonPartialCall.cfg index 0bbf3ad9f029..aa782f323cb1 100644 --- a/yql/essentials/tests/s-expressions/suites/Udf/PythonPartialCall.cfg +++ b/yql/essentials/tests/s-expressions/suites/Udf/PythonPartialCall.cfg @@ -1,4 +1,4 @@ in Input input.txt out Output output.txt res result.txt -udf python2_udf +udf python3_udf diff --git a/yql/essentials/tests/s-expressions/suites/Udf/PythonPartialCall.yql 
b/yql/essentials/tests/s-expressions/suites/Udf/PythonPartialCall.yql index f45e2d915711..1a7f7f9dbd26 100644 --- a/yql/essentials/tests/s-expressions/suites/Udf/PythonPartialCall.yql +++ b/yql/essentials/tests/s-expressions/suites/Udf/PythonPartialCall.yql @@ -6,7 +6,7 @@ (let ui32 (DataType 'Uint32)) (let udfType (CallableType '('1) '(ui32) '(ui32) '((OptionalType ui32)))) (let udfScript (String '"def Add(x, y): return x + (y if y is not None else 1)")) -(let udf (ScriptUdf 'Python 'Add udfType udfScript)) +(let udf (ScriptUdf 'Python3 'Add udfType udfScript)) # call udf (let x (Uint32 '20)) diff --git a/yql/essentials/tests/s-expressions/suites/Udf/PythonStruct.cfg b/yql/essentials/tests/s-expressions/suites/Udf/PythonStruct.cfg index 0bbf3ad9f029..aa782f323cb1 100644 --- a/yql/essentials/tests/s-expressions/suites/Udf/PythonStruct.cfg +++ b/yql/essentials/tests/s-expressions/suites/Udf/PythonStruct.cfg @@ -1,4 +1,4 @@ in Input input.txt out Output output.txt res result.txt -udf python2_udf +udf python3_udf diff --git a/yql/essentials/tests/s-expressions/suites/Udf/PythonStruct.yql b/yql/essentials/tests/s-expressions/suites/Udf/PythonStruct.yql index 38a484a4c3ca..89e0bd435d2f 100644 --- a/yql/essentials/tests/s-expressions/suites/Udf/PythonStruct.yql +++ b/yql/essentials/tests/s-expressions/suites/Udf/PythonStruct.yql @@ -18,7 +18,7 @@ def NewPerson(name, age): return Person(name, age) @@)) -(let udf (ScriptUdf 'Python 'NewPerson udfType udfScript)) +(let udf (ScriptUdf 'Python3 'NewPerson udfType udfScript)) # call udf (let result (Apply udf (String 'Jamel) (Uint32 '99))) diff --git a/yql/essentials/tests/s-expressions/suites/Udf/PythonSum.cfg b/yql/essentials/tests/s-expressions/suites/Udf/PythonSum.cfg index 0bbf3ad9f029..aa782f323cb1 100644 --- a/yql/essentials/tests/s-expressions/suites/Udf/PythonSum.cfg +++ b/yql/essentials/tests/s-expressions/suites/Udf/PythonSum.cfg @@ -1,4 +1,4 @@ in Input input.txt out Output output.txt res result.txt -udf python2_udf +udf python3_udf diff --git a/yql/essentials/tests/s-expressions/suites/Udf/PythonSum.yql b/yql/essentials/tests/s-expressions/suites/Udf/PythonSum.yql index b240dc0f58be..ef5f4e154935 100644 --- a/yql/essentials/tests/s-expressions/suites/Udf/PythonSum.yql +++ b/yql/essentials/tests/s-expressions/suites/Udf/PythonSum.yql @@ -6,7 +6,7 @@ (let ui32 (DataType 'Uint32)) (let udfType (CallableType '() '(ui32) '(ui32) '(ui32))) (let udfScript (String '"def Sum(x, y): return x + y")) -(let udf (ScriptUdf 'Python 'Sum udfType udfScript)) +(let udf (ScriptUdf 'Python3 'Sum udfType udfScript)) # call udf (let x (Uint32 '10)) diff --git a/yql/essentials/tests/s-expressions/suites/View/ViewWithUdfProcess.cfg b/yql/essentials/tests/s-expressions/suites/View/ViewWithUdfProcess.cfg index b2849b4f983b..4cc66ff555ac 100644 --- a/yql/essentials/tests/s-expressions/suites/View/ViewWithUdfProcess.cfg +++ b/yql/essentials/tests/s-expressions/suites/View/ViewWithUdfProcess.cfg @@ -1,3 +1,3 @@ in Input input6.txt res result.txt -udf python2_udf +udf python3_udf diff --git a/yql/essentials/tests/s-expressions/suites/View/input6.txt.attr b/yql/essentials/tests/s-expressions/suites/View/input6.txt.attr index 501fdca6dbaf..7f425e31eaaf 100644 --- a/yql/essentials/tests/s-expressions/suites/View/input6.txt.attr +++ b/yql/essentials/tests/s-expressions/suites/View/input6.txt.attr @@ -1,5 +1,5 @@ { - "_yql_view_my"="$udfScript=\"def f(row):\n\treturn {'myvalue':b'm' + row.value}\n\";$udf=Python::f(Callable<(Struct)->Struct>, $udfScript);PROCESS self USING 
$udf(TableRow());"; + "_yql_view_my"="$udfScript=\"def f(row):\n\treturn {'myvalue':b'm' + row.value}\n\";$udf=Python3::f(Callable<(Struct)->Struct>, $udfScript);PROCESS self USING $udf(TableRow());"; "_yql_row_spec" = { "Type" = [ "StructType"; From f4d9e6b05d8c72b02c0cbd2dfc4e7c1b7f86b19f Mon Sep 17 00:00:00 2001 From: aneporada Date: Fri, 29 Nov 2024 23:59:52 +0300 Subject: [PATCH 09/16] Make a clone of DQ nodes in yql/essentials commit_hash:00e744cfe198dba16af67209723070b40f5726c8 --- .../core/dq_expr_nodes/dq_expr_nodes.h | 71 ++++ .../core/dq_expr_nodes/dq_expr_nodes.json | 316 ++++++++++++++++++ yql/essentials/core/dq_expr_nodes/ya.make | 53 +++ .../core/dqs_expr_nodes/dqs_expr_nodes.h | 10 + .../core/dqs_expr_nodes/dqs_expr_nodes.json | 50 +++ yql/essentials/core/dqs_expr_nodes/ya.make | 53 +++ .../core/expr_nodes_gen/gen/__main__.py | 13 + .../expr_nodes_gen/yql_expr_nodes_gen.jnj | 50 ++- yql/essentials/core/ya.make | 2 + 9 files changed, 603 insertions(+), 15 deletions(-) create mode 100644 yql/essentials/core/dq_expr_nodes/dq_expr_nodes.h create mode 100644 yql/essentials/core/dq_expr_nodes/dq_expr_nodes.json create mode 100644 yql/essentials/core/dq_expr_nodes/ya.make create mode 100644 yql/essentials/core/dqs_expr_nodes/dqs_expr_nodes.h create mode 100644 yql/essentials/core/dqs_expr_nodes/dqs_expr_nodes.json create mode 100644 yql/essentials/core/dqs_expr_nodes/ya.make diff --git a/yql/essentials/core/dq_expr_nodes/dq_expr_nodes.h b/yql/essentials/core/dq_expr_nodes/dq_expr_nodes.h new file mode 100644 index 000000000000..c6a9ce8119c3 --- /dev/null +++ b/yql/essentials/core/dq_expr_nodes/dq_expr_nodes.h @@ -0,0 +1,71 @@ +#pragma once + +#include + +#include + +namespace NYql::NNodes { + +#include + +namespace NDq { +struct TTopSortSettings { + static inline const TString AscendingSort = "Asc"; + static inline const TString DescendingSort = "Desc"; +}; + +class TDqConnection : public NGenerated::TDqConnectionStub { +public: + explicit TDqConnection(const TExprNode* node) + : TDqConnectionStub(node) {} + + explicit TDqConnection(const TExprNode::TPtr& node) + : TDqConnectionStub(node) {} + + static bool Match(const TExprNode* node) { + if (!node) { + return false; + } + + if (!node->IsCallable()) { + return false; + } + + if (node->ChildrenSize() < 1) { + return false; + } + + return TDqOutput::Match(node->Child(0)); + } +}; + +class TDqOutputAnnotationBase : public NGenerated::TDqOutputAnnotationBaseStub { +public: + explicit TDqOutputAnnotationBase(const TExprNode* node) + : TDqOutputAnnotationBaseStub(node) {} + + explicit TDqOutputAnnotationBase(const TExprNode::TPtr& node) + : TDqOutputAnnotationBaseStub(node) {} + + static bool Match(const TExprNode* node) { + if (!node) { + return false; + } + + if (!node->IsCallable()) { + return false; + } + + if (node->ChildrenSize() < 2) { + return false; + } + + return TCoAtom::Match(node->Child(0)) + && TCallable::Match(node->Child(1)); + } +}; +} // namespace NDq + +#include + +} // namespace NYql::NNodes diff --git a/yql/essentials/core/dq_expr_nodes/dq_expr_nodes.json b/yql/essentials/core/dq_expr_nodes/dq_expr_nodes.json new file mode 100644 index 000000000000..e9efb8c20423 --- /dev/null +++ b/yql/essentials/core/dq_expr_nodes/dq_expr_nodes.json @@ -0,0 +1,316 @@ +{ + "NodeRootType": "TExprBase", + "NodeBuilderBase": "TNodeBuilderBase", + "ListBuilderBase": "TListBuilderBase", + "FreeArgCallableBase": "TFreeArgCallable", + "FreeArgBuilderBase": "TFreeArgCallableBuilderBase", + "Nodes": [ + { + "Name": "TDqJoinKeyTuple", + 
"Base": "TExprBase", + "Match": {"Type": "Tuple"}, + "Children": [ + {"Index": 0, "Name": "LeftLabel", "Type": "TCoAtom"}, + {"Index": 1, "Name": "LeftColumn", "Type": "TCoAtom"}, + {"Index": 2, "Name": "RightLabel", "Type": "TCoAtom"}, + {"Index": 3, "Name": "RightColumn", "Type": "TCoAtom"} + ] + }, + { + "Name": "TDqJoinKeyTupleList", + "ListBase": "TDqJoinKeyTuple" + }, + { + "Name": "TDqJoinBase", + "Base": "TCallable", + "Match": {"Type": "CallableBase"}, + "Builder": {"Generate": "None"}, + "Children": [ + {"Index": 0, "Name": "LeftInput", "Type": "TExprBase"}, + {"Index": 2, "Name": "LeftLabel", "Type": "TExprBase", + "NB": "Atom - if left is real table, Void - otherwise"}, + {"Index": 1, "Name": "RightInput", "Type": "TExprBase"}, + {"Index": 3, "Name": "RightLabel", "Type": "TExprBase", + "NB": "Atom - if right is real table, Void - otherwise"}, + {"Index": 4, "Name": "JoinType", "Type": "TCoAtom"}, + {"Index": 5, "Name": "JoinKeys", "Type": "TDqJoinKeyTupleList"}, + {"Index": 6, "Name": "LeftJoinKeyNames", "Type": "TCoAtomList"}, + {"Index": 7, "Name": "RightJoinKeyNames", "Type": "TCoAtomList"} + ] + }, + { + "Name": "TDqJoin", + "Base": "TDqJoinBase", + "Match": {"Type": "Callable", "Name": "DqJoin"}, + "Children": [ + {"Index": 8, "Name": "JoinAlgo", "Type": "TCoAtom"}, + {"Index": 9, "Name": "Flags", "Type": "TCoAtomList", "Optional": true} + ] + }, + { + "Name": "TDqPhyGraceJoin", + "Base": "TDqJoinBase", + "Match": {"Type": "Callable", "Name": "DqPhyGraceJoin"}, + "Children": [ + {"Index": 8, "Name": "Flags", "Type": "TCoAtomList", "Optional": true} + ] + }, + { + "Name": "TDqPhyMapJoin", + "Base": "TDqJoinBase", + "Match": {"Type": "Callable", "Name": "DqPhyMapJoin"} + }, + { + "Name": "TDqPhyCrossJoin", + "Base": "TDqJoinBase", + "Match": {"Type": "Callable", "Name": "DqPhyCrossJoin"} + }, + { + "Name": "TDqPhyJoinDict", + "Base": "TDqJoinBase", + "Match": {"Type": "Callable", "Name": "DqPhyJoinDict"} + }, + { + "Name": "TDqSource", + "Base": "TCallable", + "Match": {"Type": "Callable", "Name": "DqSource"}, + "Children": [ + {"Index": 0, "Name": "DataSource", "Type": "TCallable"}, + {"Index": 1, "Name": "Settings", "Type": "TExprBase"} + ] + }, + { + "Name": "TDqOutputAnnotationBase", + "Base": "TCallable", + "Definition": "Custom", + "Builder": {"Generate": "None"}, + "Children": [ + {"Index": 0, "Name": "Index", "Type": "TCoAtom"}, + {"Index": 1, "Name": "DataSink", "Type": "TCallable"} + ] + }, + { + "Name": "TDqTransform", + "Base": "TDqOutputAnnotationBase", + "Match": {"Type": "Callable", "Name": "DqTransform"}, + "Children": [ + {"Index": 2, "Name": "Type", "Type": "TCoAtom"}, + {"Index": 3, "Name": "InputType", "Type": "TExprBase"}, + {"Index": 4, "Name": "OutputType", "Type": "TExprBase"}, + {"Index": 5, "Name": "Settings", "Type": "TCallable"} + ] + }, + { + "Name": "TDqSink", + "Base": "TDqOutputAnnotationBase", + "Match": {"Type": "Callable", "Name": "DqSink"}, + "Children": [ + {"Index": 2, "Name": "Settings", "Type": "TCallable"} + ] + }, + { + "Name": "TDqStageOutputsList", + "ListBase": "TDqOutputAnnotationBase" + }, + { + "Name": "TDqStageBase", + "Base": "TCallable", + "Match": {"Type": "CallableBase"}, + "Builder": {"Generate": "None"}, + "Children": [ + {"Index": 0, "Name": "Inputs", "Type": "TExprList"}, + {"Index": 1, "Name": "Program", "Type": "TCoLambda"}, + {"Index": 2, "Name": "Settings", "Type": "TCoNameValueTupleList"}, + {"Index": 3, "Name": "Outputs", "Type": "TDqStageOutputsList", "Optional": true} + ] + }, + { + "Name": "TDqStage", + 
"Base": "TDqStageBase", + "Match": {"Type": "Callable", "Name": "DqStage"} + }, + { + "Name": "TDqPhyStage", + "Base": "TDqStageBase", + "Match": {"Type": "Callable", "Name": "DqPhyStage"} + }, + { + "Name": "TDqStageList", + "ListBase": "TDqStageBase" + }, + { + "Name": "TDqPhyStageList", + "ListBase": "TDqPhyStage" + }, + { + "Name": "TDqOutput", + "Base": "TCallable", + "Match": {"Type": "Callable", "Name": "TDqOutput"}, + "Children": [ + {"Index": 0, "Name": "Stage", "Type": "TDqStageBase"}, + {"Index": 1, "Name": "Index", "Type": "TCoAtom"} + ] + }, + { + "Name": "TDqConnection", + "Base": "TCallable", + "Definition": "Custom", + "Builder": {"Generate": "None"}, + "Children": [ + {"Index": 0, "Name": "Output", "Type": "TDqOutput"} + ] + }, + { + "Name": "TDqCnHashShuffle", + "Base": "TDqConnection", + "Match": {"Type": "Callable", "Name": "DqCnHashShuffle"}, + "Children": [ + {"Index": 1, "Name": "KeyColumns", "Type": "TCoAtomList"} + ] + }, + { + "Name": "TDqCnBroadcast", + "Base": "TDqConnection", + "Match": {"Type": "Callable", "Name": "DqCnBroadcast"} + }, + { + "Name": "TDqCnUnionAll", + "Base": "TDqConnection", + "Match": {"Type": "Callable", "Name": "DqCnUnionAll"} + }, + { + "Name": "TDqCnMap", + "Base": "TDqConnection", + "Match": {"Type": "Callable", "Name": "DqCnMap"} + }, + { + "Name": "TDqCnStreamLookup", + "Base": "TDqConnection", + "Match": {"Type": "Callable", "Name": "DqCnStreamLookup"}, + "Children": [ + {"Index": 1, "Name": "LeftLabel", "Type": "TCoAtom"}, + {"Index": 2, "Name": "RightInput", "Type": "TExprBase"}, + {"Index": 3, "Name": "RightLabel", "Type": "TCoAtom"}, + {"Index": 4, "Name": "JoinType", "Type": "TCoAtom"}, + {"Index": 5, "Name": "JoinKeys", "Type": "TDqJoinKeyTupleList"}, + {"Index": 6, "Name": "LeftJoinKeyNames", "Type": "TCoAtomList"}, + {"Index": 7, "Name": "RightJoinKeyNames", "Type": "TCoAtomList"}, + {"Index": 8, "Name": "TTL", "Type": "TCoAtom"}, + {"Index": 9, "Name": "MaxDelayedRows", "Type": "TCoAtom"}, + {"Index": 10, "Name": "MaxCachedRows", "Type": "TCoAtom"} + ] + }, + { + "Name": "TDqCnResult", + "Base": "TDqConnection", + "Match": {"Type": "Callable", "Name": "DqCnResult"}, + "Children": [ + {"Index": 1, "Name": "ColumnHints", "Type": "TCoAtomList"} + ] + }, + { + "Name": "TDqCnValue", + "Base": "TDqConnection", + "Match": {"Type": "Callable", "Name": "DqCnValue"} + }, + { + "Name": "TDqSortColumn", + "Base": "TExprBase", + "Match": {"Type": "Tuple"}, + "Children": [ + {"Index": 0, "Name": "Column", "Type": "TCoAtom"}, + {"Index": 1, "Name": "SortDirection", "Type": "TCoAtom"} + ] + }, + { + "Name": "TDqSortColumnList", + "ListBase": "TDqSortColumn" + }, + { + "Name": "TDqCnMerge", + "Base": "TDqConnection", + "Match": {"Type": "Callable", "Name": "DqCnMerge"}, + "Children": [ + {"Index": 1, "Name": "SortColumns", "Type": "TDqSortColumnList"} + ] + }, + { + "Name": "TDqReplicate", + "Base": "TFreeArgCallable", + "Match": {"Type": "Callable", "Name": "DqReplicate"}, + "Children": [ + {"Index": 0, "Name": "Input", "Type": "TExprBase"} + ] + }, + { + "Name": "TDqQuery", + "Base": "TCallable", + "Match": {"Type": "Callable", "Name": "DqQuery!"}, + "Children": [ + {"Index": 0, "Name": "World", "Type": "TExprBase"}, + {"Index": 1, "Name": "SinkStages", "Type": "TDqStageList"} + ] + }, + { + "Name": "TDqPrecompute", + "Base": "TCallable", + "Match": {"Type": "Callable", "Name": "DqPrecompute"}, + "Children": [ + {"Index": 0, "Name": "Input", "Type": "TExprBase"} + ] + }, + { + "Name": "TDqPhyPrecompute", + "Base": "TCallable", + "Match": 
{"Type": "Callable", "Name": "DqPhyPrecompute"}, + "Children": [ + {"Index": 0, "Name": "Connection", "Type": "TDqConnection"} + ] + }, + { + "Name": "TDqSqlExternalFunction", + "Base": "TCallable", + "Match": {"Type": "Callable", "Name": "SqlExternalFunction"}, + "Children": [ + {"Index": 0, "Name": "TransformType", "Type": "TExprBase"}, + {"Index": 1, "Name": "TransformName", "Type": "TExprBase"}, + {"Index": 2, "Name": "Settings", "Type": "TCoNameValueTupleList"} + ] + }, + { + "Name": "TDqPhyLength", + "Base": "TCallable", + "Match": {"Type": "Callable", "Name": "DqPhyLength"}, + "Children": [ + {"Index": 0, "Name": "Input", "Type": "TExprBase"}, + {"Index": 1, "Name": "Name", "Type": "TCoAtom"} + ] + }, + { + "Name": "TDqReadWrapBase", + "Base": "TExprBase", + "Match": {"Type": "CallableBase"}, + "Builder": {"Generate": "None"}, + "Children": [ + {"Index": 0, "Name": "Input", "Type": "TExprBase"}, + {"Index": 1, "Name": "Flags", "Type": "TCoAtomList"}, + {"Index": 2, "Name": "Token", "Type": "TCoSecureParam", "Optional": true} + ] + }, + { + "Name": "TDqReadWrap", + "Base": "TDqReadWrapBase", + "Match": {"Type": "Callable", "Name": "DqReadWrap"} + }, + { + "Name": "TDqReadWideWrap", + "Base": "TDqReadWrapBase", + "Match": {"Type": "Callable", "Name": "DqReadWideWrap"} + }, + { + "Name": "TDqReadBlockWideWrap", + "Base": "TDqReadWrapBase", + "Match": {"Type": "Callable", "Name": "DqReadBlockWideWrap"} + } + ] +} diff --git a/yql/essentials/core/dq_expr_nodes/ya.make b/yql/essentials/core/dq_expr_nodes/ya.make new file mode 100644 index 000000000000..e1acd2003eca --- /dev/null +++ b/yql/essentials/core/dq_expr_nodes/ya.make @@ -0,0 +1,53 @@ +LIBRARY() + +SRCS( + dq_expr_nodes.h +) + +PEERDIR( + yql/essentials/core/expr_nodes +) + +SRCDIR( + yql/essentials/core/expr_nodes_gen +) + +IF(EXPORT_CMAKE) + RUN_PYTHON3( + ${ARCADIA_ROOT}/yql/essentials/core/expr_nodes_gen/gen/__main__.py + yql_expr_nodes_gen.jnj + dq_expr_nodes.json + dq_expr_nodes.gen.h + dq_expr_nodes.decl.inl.h + dq_expr_nodes.defs.inl.h + NDq + IN yql_expr_nodes_gen.jnj + IN dq_expr_nodes.json + OUT dq_expr_nodes.gen.h + OUT dq_expr_nodes.decl.inl.h + OUT dq_expr_nodes.defs.inl.h + OUTPUT_INCLUDES + ${ARCADIA_ROOT}/yql/essentials/core/expr_nodes_gen/yql_expr_nodes_gen.h + ${ARCADIA_ROOT}/util/generic/hash_set.h + ) +ELSE() + RUN_PROGRAM( + yql/essentials/core/expr_nodes_gen/gen + yql_expr_nodes_gen.jnj + dq_expr_nodes.json + dq_expr_nodes.gen.h + dq_expr_nodes.decl.inl.h + dq_expr_nodes.defs.inl.h + NDq + IN yql_expr_nodes_gen.jnj + IN dq_expr_nodes.json + OUT dq_expr_nodes.gen.h + OUT dq_expr_nodes.decl.inl.h + OUT dq_expr_nodes.defs.inl.h + OUTPUT_INCLUDES + ${ARCADIA_ROOT}/yql/essentials/core/expr_nodes_gen/yql_expr_nodes_gen.h + ${ARCADIA_ROOT}/util/generic/hash_set.h + ) +ENDIF() + +END() diff --git a/yql/essentials/core/dqs_expr_nodes/dqs_expr_nodes.h b/yql/essentials/core/dqs_expr_nodes/dqs_expr_nodes.h new file mode 100644 index 000000000000..8851127f8315 --- /dev/null +++ b/yql/essentials/core/dqs_expr_nodes/dqs_expr_nodes.h @@ -0,0 +1,10 @@ +#pragma once + +#include + +#include + +namespace NYql::NNodes { +#include +#include +} diff --git a/yql/essentials/core/dqs_expr_nodes/dqs_expr_nodes.json b/yql/essentials/core/dqs_expr_nodes/dqs_expr_nodes.json new file mode 100644 index 000000000000..7ee283859f28 --- /dev/null +++ b/yql/essentials/core/dqs_expr_nodes/dqs_expr_nodes.json @@ -0,0 +1,50 @@ +{ + "NodeRootType": "TExprBase", + "NodeBuilderBase": "TNodeBuilderBase", + "ListBuilderBase": "TListBuilderBase", + 
"FreeArgCallableBase": "TFreeArgCallable", + "FreeArgBuilderBase": "TFreeArgCallableBuilderBase", + "Nodes": [ + { + "Name": "TDqWrite", + "Base": "TCallable", + "Match": {"Type": "Callable", "Name": "DqWrite"}, + "Children": [ + {"Index": 0, "Name": "Input", "Type": "TExprBase"}, + {"Index": 1, "Name": "Provider", "Type": "TCoAtom"}, + {"Index": 2, "Name": "Settings", "Type": "TCoNameValueTupleList", "Optional": true} + ] + }, + { + "Name": "TDqSourceWrapBase", + "Base": "TExprBase", + "Match": {"Type": "CallableBase"}, + "Children": [ + {"Index": 0, "Name": "Input", "Type": "TExprBase"}, + {"Index": 1, "Name": "DataSource", "Type": "TCoDataSource"}, + {"Index": 2, "Name": "RowType", "Type": "TExprBase"}, + {"Index": 3, "Name": "Settings", "Type": "TExprBase", "Optional": true} + ] + }, + { + "Name": "TDqSourceWrap", + "Base": "TDqSourceWrapBase", + "Match": {"Type": "Callable", "Name": "DqSourceWrap"} + }, + { + "Name": "TDqSourceWideWrap", + "Base": "TDqSourceWrapBase", + "Match": {"Type": "Callable", "Name": "DqSourceWideWrap"} + }, + { + "Name": "TDqSourceWideBlockWrap", + "Base": "TDqSourceWrapBase", + "Match": {"Type": "Callable", "Name": "DqSourceWideBlockWrap"} + }, + { + "Name": "TDqLookupSourceWrap", + "Base": "TDqSourceWrapBase", + "Match": {"Type": "Callable", "Name": "TDqLookupSourceWrap"} + } + ] +} diff --git a/yql/essentials/core/dqs_expr_nodes/ya.make b/yql/essentials/core/dqs_expr_nodes/ya.make new file mode 100644 index 000000000000..35519a2d7127 --- /dev/null +++ b/yql/essentials/core/dqs_expr_nodes/ya.make @@ -0,0 +1,53 @@ +LIBRARY() + +SRCS( + dqs_expr_nodes.h +) + +PEERDIR( + yql/essentials/core/expr_nodes +) + +SRCDIR( + yql/essentials/core/expr_nodes_gen +) + +IF(EXPORT_CMAKE) + RUN_PYTHON3( + ${ARCADIA_ROOT}/yql/essentials/core/expr_nodes_gen/gen/__main__.py + yql_expr_nodes_gen.jnj + dqs_expr_nodes.json + dqs_expr_nodes.gen.h + dqs_expr_nodes.decl.inl.h + dqs_expr_nodes.defs.inl.h + NDq + IN yql_expr_nodes_gen.jnj + IN dqs_expr_nodes.json + OUT dqs_expr_nodes.gen.h + OUT dqs_expr_nodes.decl.inl.h + OUT dqs_expr_nodes.defs.inl.h + OUTPUT_INCLUDES + ${ARCADIA_ROOT}/yql/essentials/core/expr_nodes_gen/yql_expr_nodes_gen.h + ${ARCADIA_ROOT}/util/generic/hash_set.h + ) +ELSE() + RUN_PROGRAM( + yql/essentials/core/expr_nodes_gen/gen + yql_expr_nodes_gen.jnj + dqs_expr_nodes.json + dqs_expr_nodes.gen.h + dqs_expr_nodes.decl.inl.h + dqs_expr_nodes.defs.inl.h + NDq + IN yql_expr_nodes_gen.jnj + IN dqs_expr_nodes.json + OUT dqs_expr_nodes.gen.h + OUT dqs_expr_nodes.decl.inl.h + OUT dqs_expr_nodes.defs.inl.h + OUTPUT_INCLUDES + ${ARCADIA_ROOT}/yql/essentials/core/expr_nodes_gen/yql_expr_nodes_gen.h + ${ARCADIA_ROOT}/util/generic/hash_set.h + ) +ENDIF() + +END() diff --git a/yql/essentials/core/expr_nodes_gen/gen/__main__.py b/yql/essentials/core/expr_nodes_gen/gen/__main__.py index b381e64b5d0e..39e182e1943f 100755 --- a/yql/essentials/core/expr_nodes_gen/gen/__main__.py +++ b/yql/essentials/core/expr_nodes_gen/gen/__main__.py @@ -12,8 +12,13 @@ headerOutFile = sys.argv[3] declOutFile = sys.argv[4] defsOutFile = sys.argv[5] +nodeNspace = sys.argv[6] if len(sys.argv) >= 7 else "" +nodeNspacePrefix = nodeNspace + "::" if nodeNspace else "" + +os.environ['NODES_NAMESPACE'] = nodeNspace env = Environment(loader=FileSystemLoader(templateDir)) +env.globals['ENV'] = os.getenv template = env.get_template(templateFilename) json_data = open(jsonFile) @@ -24,8 +29,10 @@ for node in model["Nodes"]: aux = node["aux"] = {} + aux["qName"] = nodeNspacePrefix + node["Name"] aux["stubName"] 
= node["Name"] + "Stub" aux["stubMaybeName"] = node["Name"] + "MaybeStub" + aux["qStubMaybeName"] = nodeNspacePrefix + node["Name"] + "MaybeStub" aux["stubBuilderName"] = node["Name"] + "BuilderStub" aux["stubBuilderAliasName"] = node["Name"] + "Builder" @@ -109,6 +116,11 @@ def isListBuilder(node): aux["generateBuilderStub"] = node["Builder"]["Generate"] != "None" and node["Builder"]["Kind"] != "List" aux["generateBuilder"] = node["Builder"]["Generate"] == "Auto" + if isListBuilder(node): + if node["Builder"]["ListItemType"] in nodesMap: + aux["qListItemType"] = nodeNspacePrefix + node["Builder"]["ListItemType"] + else: + aux["qListItemType"] = node["Builder"]["ListItemType"] # Get all children allChildren = [] @@ -174,6 +186,7 @@ def addUsages(typename): for child in node["Children"]: addUsages(child["Type"]) aux["usages"] = usages + aux["qUsages"] = [name if name not in nodesMap else nodeNspacePrefix + name for name in usages] aux["typenames"] = declarations usages = [] diff --git a/yql/essentials/core/expr_nodes_gen/yql_expr_nodes_gen.jnj b/yql/essentials/core/expr_nodes_gen/yql_expr_nodes_gen.jnj index 94225db9ea2a..7d074a070d65 100644 --- a/yql/essentials/core/expr_nodes_gen/yql_expr_nodes_gen.jnj +++ b/yql/essentials/core/expr_nodes_gen/yql_expr_nodes_gen.jnj @@ -1,5 +1,17 @@ // Auto-generated by {{ generator }}, do not modify. +{% set nNspace = ENV("NODES_NAMESPACE") %} + +{%- if nNspace != "" %} + +{% set nPrefix = nNspace + "::" %} +{% set nNspaceOpen = "namespace " + nNspace + " {" %} +{% set nNspaceClose = "} // namespace " + nNspace %} + +{% else %} +{% set nPrefix, nNspaceOpen, nNspaceClose = "", "", "" %} +{%- endif %} + {% if genType == "Header" %} #pragma once @@ -8,6 +20,7 @@ namespace NYql { namespace NNodes { +{{ nNspaceOpen }} namespace NGenerated { {% for node in nodes -%} @@ -243,10 +256,12 @@ public: {% endfor %} } // namespace NGenerated +{{ nNspaceClose }} } // namespace NNodes } // namespace NYql {% elif genType == "Declarations" -%} +{{ nNspaceOpen }} {% for node in nodes %} {%- if node.Definition == "Custom" %} class {{ node.Name }}; @@ -254,28 +269,32 @@ class {{ node.Name }}; using {{ node.Name }} = NGenerated::{{ node.aux.stubName }}<{{ node.aux.usages | join(", ") }}>; {% endif %} {%- endfor %} +{{ nNspaceClose }} {% elif genType == "Definitions" -%} +{{ nNspaceOpen }} {% for node in nodes %} static_assert(std::is_constructible<{{ node.Name }}, const TExprNode*>::value, "{{ node.Name }} isn't defined correctly."); static_assert(std::is_constructible<{{ node.Name }}, const TExprNode::TPtr&>::value, "{{ node.Name }} isn't defined correctly."); {% endfor %} +{{ nNspaceClose }} {% for node in nodes %} template<> -class TMaybeNode<{{ node.Name }}> : public NGenerated:: - {{ node.aux.stubMaybeName }} +class TMaybeNode<{{ node.aux.qName }}> : public {{ nPrefix }}NGenerated:: + {{ node.aux.stubMaybeName }} { public: TMaybeNode(const TExprNode* node = nullptr) : {{ node.aux.stubMaybeName }}(node) {} TMaybeNode(const TExprNode::TPtr& node) : {{ node.aux.stubMaybeName }}(node) {} - TMaybeNode(const {{ node.Name }}& node) : {{ node.aux.stubMaybeName }}(node.template Maybe<{{ node.Name }}>()) {} + TMaybeNode(const {{ node.aux.qName }}& node) : {{ node.aux.stubMaybeName }}(node.template Maybe<{{ node.aux.qName }}>()) {} }; {% endfor %} +{{ nNspaceOpen }} namespace NGenerated { {% for node in nodes %} {%- if node.aux.generateBuilderStub %} @@ -293,34 +312,35 @@ using {{ node.aux.stubBuilderAliasName }} = {% endif %} {%- endfor %} } // namespace NGenerated +{{ nNspaceClose 
}} {% for node in nodes %} {%- if node.aux.generateBuilder %} {% if node.Builder.Kind == "Node" or node.Builder.Kind == "FreeArg" %} template -class TNodeBuilder : public NGenerated::{{ node.aux.stubBuilderAliasName }} +class TNodeBuilder : public {{ nPrefix }}NGenerated::{{ node.aux.stubBuilderAliasName }} { private: - typedef typename NGenerated::{{ node.aux.stubBuilderAliasName }}::BuildFuncType BuildFuncType; - typedef typename NGenerated::{{ node.aux.stubBuilderAliasName }}::GetArgFuncType GetArgFuncType; + typedef typename {{ nPrefix }}NGenerated::{{ node.aux.stubBuilderAliasName }}::BuildFuncType BuildFuncType; + typedef typename {{ nPrefix }}NGenerated::{{ node.aux.stubBuilderAliasName }}::GetArgFuncType GetArgFuncType; public: TNodeBuilder(TExprContext& ctx, TPositionHandle pos, BuildFuncType buildFunc, GetArgFuncType getArgFunc) - : NGenerated::{{ node.aux.stubBuilderAliasName }}(ctx, pos, buildFunc, getArgFunc) {} + : {{ nPrefix }}NGenerated::{{ node.aux.stubBuilderAliasName }}(ctx, pos, buildFunc, getArgFunc) {} {% elif node.Builder.Kind == "List" %} template -class TNodeBuilder : public TListBuilderBase +class TNodeBuilder : public TListBuilderBase { private: - typedef typename TListBuilderBase::BuildFuncType BuildFuncType; - typedef typename TListBuilderBase::GetArgFuncType GetArgFuncType; + typedef typename TListBuilderBase::BuildFuncType BuildFuncType; + typedef typename TListBuilderBase::GetArgFuncType GetArgFuncType; public: TNodeBuilder(TExprContext& ctx, TPositionHandle pos, BuildFuncType buildFunc, GetArgFuncType getArgFunc) - : TListBuilderBase(ctx, pos, buildFunc, getArgFunc) {} + : TListBuilderBase(ctx, pos, buildFunc, getArgFunc) {} {% else %} class UnknownBuilder { static_assert(false, "Unknown builder kind: {{ node.Builder.Kind }}."); {% endif %} - {{ node.Name }} DoBuild() { + {{ node.aux.qName }} DoBuild() { {%- if node.Match and (node.Match.Type == "Callable" or node.Match.Type == "CallableBase") %} TExprNode::TListType argsList; {%- if node.Builder.Kind == "List" %} @@ -347,7 +367,7 @@ class UnknownBuilder { {%- else %} auto node = this->Ctx.NewCallable(this->Pos, "{{ node.Match.Name }}", std::move(argsList)); {%- endif %} - return {{ node.Name }}(node); + return {{ node.aux.qName }}(node); {%- elif node.Match and node.Match.Type == "Tuple"%} TExprNode::TListType tupleItems; {%- for child in node.aux.allChildren %} @@ -357,14 +377,14 @@ class UnknownBuilder { } {%- endfor %} auto node = this->Ctx.NewList(this->Pos, std::move(tupleItems)); - return {{ node.Name }}(node); + return {{ node.aux.qName }}(node); {%- elif node.ListBase and node.Builder.Kind == "List"%} TExprNode::TListType nodeChildren; for (auto child : this->Items) { nodeChildren.push_back(child.Ptr()); } auto node = this->Ctx.NewList(this->Pos, std::move(nodeChildren)); - return {{ node.Name }}(node); + return {{ node.aux.qName }}(node); {%- else %} static_assert(false, "Don't know how to build {{ node.Name }}."); {% endif %} diff --git a/yql/essentials/core/ya.make b/yql/essentials/core/ya.make index 2e253709c291..100103b92fca 100644 --- a/yql/essentials/core/ya.make +++ b/yql/essentials/core/ya.make @@ -98,7 +98,9 @@ END() RECURSE( cbo credentials + dq_expr_nodes dq_integration + dqs_expr_nodes file_storage issue minsketch From df64d4599e151698c173343d73e12328aa9ce1ef Mon Sep 17 00:00:00 2001 From: akhropov Date: Sat, 30 Nov 2024 03:24:07 +0300 Subject: [PATCH 10/16] More functions marked as noexcept. 
commit_hash:61b2ca8fab66696aade476173e07001d1b886b5e --- util/generic/array_ref.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/util/generic/array_ref.h b/util/generic/array_ref.h index fb9ab0bcf13a..2e62071be80a 100644 --- a/util/generic/array_ref.h +++ b/util/generic/array_ref.h @@ -209,7 +209,7 @@ class TArrayRef { * * DEPRECATED. DO NOT USE. */ - TArrayRef SubRegion(size_t offset, size_t size) const { + TArrayRef SubRegion(size_t offset, size_t size) const noexcept { if (size == 0 || offset >= S_) { return TArrayRef(); } @@ -255,31 +255,31 @@ TArrayRef as_writable_bytes(TArrayRef arrayRef Y_LIFETIME_BOUND) noexce } template -constexpr TArrayRef MakeArrayRef(const Range& range) { +constexpr TArrayRef MakeArrayRef(const Range& range) noexcept { return TArrayRef(range); } template -constexpr TArrayRef MakeArrayRef(Range& range) { +constexpr TArrayRef MakeArrayRef(Range& range) noexcept { return TArrayRef(range); } template -constexpr TArrayRef MakeConstArrayRef(const Range& range) { +constexpr TArrayRef MakeConstArrayRef(const Range& range) noexcept { return TArrayRef(range); } template -constexpr TArrayRef MakeConstArrayRef(Range& range) { +constexpr TArrayRef MakeConstArrayRef(Range& range) noexcept { return TArrayRef(range); } template -constexpr TArrayRef MakeArrayRef(T* data Y_LIFETIME_BOUND, size_t size) { +constexpr TArrayRef MakeArrayRef(T* data Y_LIFETIME_BOUND, size_t size) noexcept { return TArrayRef(data, size); } template -constexpr TArrayRef MakeArrayRef(T* begin Y_LIFETIME_BOUND, T* end Y_LIFETIME_BOUND) { +constexpr TArrayRef MakeArrayRef(T* begin Y_LIFETIME_BOUND, T* end Y_LIFETIME_BOUND) noexcept { return TArrayRef(begin, end); } From 9dc6b2fddb37a98c860a013c641c112d5030c7a7 Mon Sep 17 00:00:00 2001 From: robot-piglet Date: Sat, 30 Nov 2024 12:58:42 +0300 Subject: [PATCH 11/16] Intermediate changes commit_hash:f785655b6e4979e4b61af2cb8227296a279f7ab6 --- contrib/python/httpcore/.dist-info/METADATA | 9 +- contrib/python/httpcore/httpcore/__init__.py | 5 +- contrib/python/httpcore/httpcore/_api.py | 26 ++-- .../httpcore/httpcore/_async/connection.py | 30 ++-- .../httpcore/_async/connection_pool.py | 98 ++++++++---- .../python/httpcore/httpcore/_async/http11.py | 59 ++++--- .../python/httpcore/httpcore/_async/http2.py | 50 +++--- .../httpcore/httpcore/_async/http_proxy.py | 73 +++++---- .../httpcore/httpcore/_async/interfaces.py | 26 ++-- .../httpcore/httpcore/_async/socks_proxy.py | 61 ++++---- .../httpcore/httpcore/_backends/anyio.py | 24 ++- .../httpcore/httpcore/_backends/auto.py | 13 +- .../httpcore/httpcore/_backends/base.py | 46 +++--- .../httpcore/httpcore/_backends/mock.py | 47 +++--- .../httpcore/httpcore/_backends/sync.py | 40 ++--- .../httpcore/httpcore/_backends/trio.py | 24 ++- .../python/httpcore/httpcore/_exceptions.py | 6 +- contrib/python/httpcore/httpcore/_models.py | 146 ++++++++++-------- .../httpcore/httpcore/_sync/connection.py | 30 ++-- .../httpcore/_sync/connection_pool.py | 98 ++++++++---- .../python/httpcore/httpcore/_sync/http11.py | 59 ++++--- .../python/httpcore/httpcore/_sync/http2.py | 50 +++--- .../httpcore/httpcore/_sync/http_proxy.py | 73 +++++---- .../httpcore/httpcore/_sync/interfaces.py | 26 ++-- .../httpcore/httpcore/_sync/socks_proxy.py | 61 ++++---- .../httpcore/httpcore/_synchronization.py | 57 +++---- contrib/python/httpcore/httpcore/_trace.py | 32 ++-- contrib/python/httpcore/httpcore/_utils.py | 5 +- contrib/python/httpcore/ya.make | 2 +- 29 files changed, 683 insertions(+), 593 deletions(-) 
diff --git a/contrib/python/httpcore/.dist-info/METADATA b/contrib/python/httpcore/.dist-info/METADATA index 6be8e4bb68c9..99be2236cdd5 100644 --- a/contrib/python/httpcore/.dist-info/METADATA +++ b/contrib/python/httpcore/.dist-info/METADATA @@ -1,13 +1,12 @@ Metadata-Version: 2.3 Name: httpcore -Version: 1.0.6 +Version: 1.0.7 Summary: A minimal low-level HTTP client. Project-URL: Documentation, https://www.encode.io/httpcore Project-URL: Homepage, https://www.encode.io/httpcore/ Project-URL: Source, https://github.com/encode/httpcore Author-email: Tom Christie -License-Expression: BSD-3-Clause -License-File: LICENSE.md +License: BSD-3-Clause Classifier: Development Status :: 3 - Alpha Classifier: Environment :: Web Environment Classifier: Framework :: AsyncIO @@ -153,6 +152,10 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). +## Version 1.0.7 (November 15th, 2024) + +- Support `proxy=…` configuration on `ConnectionPool()`. (#974) + ## Version 1.0.6 (October 1st, 2024) - Relax `trio` dependency pinning. (#956) diff --git a/contrib/python/httpcore/httpcore/__init__.py b/contrib/python/httpcore/httpcore/__init__.py index 330745a5dca2..662b1563a1ef 100644 --- a/contrib/python/httpcore/httpcore/__init__.py +++ b/contrib/python/httpcore/httpcore/__init__.py @@ -34,7 +34,7 @@ WriteError, WriteTimeout, ) -from ._models import URL, Origin, Request, Response +from ._models import URL, Origin, Proxy, Request, Response from ._ssl import default_ssl_context from ._sync import ( ConnectionInterface, @@ -79,6 +79,7 @@ def __init__(self, *args, **kwargs): # type: ignore "URL", "Request", "Response", + "Proxy", # async "AsyncHTTPConnection", "AsyncConnectionPool", @@ -130,7 +131,7 @@ def __init__(self, *args, **kwargs): # type: ignore "WriteError", ] -__version__ = "1.0.6" +__version__ = "1.0.7" __locals = locals() diff --git a/contrib/python/httpcore/httpcore/_api.py b/contrib/python/httpcore/httpcore/_api.py index 854235f5f603..38b961d10de8 100644 --- a/contrib/python/httpcore/httpcore/_api.py +++ b/contrib/python/httpcore/httpcore/_api.py @@ -1,17 +1,19 @@ -from contextlib import contextmanager -from typing import Iterator, Optional, Union +from __future__ import annotations + +import contextlib +import typing from ._models import URL, Extensions, HeaderTypes, Response from ._sync.connection_pool import ConnectionPool def request( - method: Union[bytes, str], - url: Union[URL, bytes, str], + method: bytes | str, + url: URL | bytes | str, *, headers: HeaderTypes = None, - content: Union[bytes, Iterator[bytes], None] = None, - extensions: Optional[Extensions] = None, + content: bytes | typing.Iterator[bytes] | None = None, + extensions: Extensions | None = None, ) -> Response: """ Sends an HTTP request, returning the response. @@ -45,15 +47,15 @@ def request( ) -@contextmanager +@contextlib.contextmanager def stream( - method: Union[bytes, str], - url: Union[URL, bytes, str], + method: bytes | str, + url: URL | bytes | str, *, headers: HeaderTypes = None, - content: Union[bytes, Iterator[bytes], None] = None, - extensions: Optional[Extensions] = None, -) -> Iterator[Response]: + content: bytes | typing.Iterator[bytes] | None = None, + extensions: Extensions | None = None, +) -> typing.Iterator[Response]: """ Sends an HTTP request, returning the response within a content manager. 
diff --git a/contrib/python/httpcore/httpcore/_async/connection.py b/contrib/python/httpcore/httpcore/_async/connection.py index 2f439cf09c5c..b42581dff8aa 100644 --- a/contrib/python/httpcore/httpcore/_async/connection.py +++ b/contrib/python/httpcore/httpcore/_async/connection.py @@ -1,8 +1,10 @@ +from __future__ import annotations + import itertools import logging import ssl -from types import TracebackType -from typing import Iterable, Iterator, Optional, Type +import types +import typing from .._backends.auto import AutoBackend from .._backends.base import SOCKET_OPTION, AsyncNetworkBackend, AsyncNetworkStream @@ -20,7 +22,7 @@ logger = logging.getLogger("httpcore.connection") -def exponential_backoff(factor: float) -> Iterator[float]: +def exponential_backoff(factor: float) -> typing.Iterator[float]: """ Generate a geometric sequence that has a ratio of 2 and starts with 0. @@ -37,15 +39,15 @@ class AsyncHTTPConnection(AsyncConnectionInterface): def __init__( self, origin: Origin, - ssl_context: Optional[ssl.SSLContext] = None, - keepalive_expiry: Optional[float] = None, + ssl_context: ssl.SSLContext | None = None, + keepalive_expiry: float | None = None, http1: bool = True, http2: bool = False, retries: int = 0, - local_address: Optional[str] = None, - uds: Optional[str] = None, - network_backend: Optional[AsyncNetworkBackend] = None, - socket_options: Optional[Iterable[SOCKET_OPTION]] = None, + local_address: str | None = None, + uds: str | None = None, + network_backend: AsyncNetworkBackend | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, ) -> None: self._origin = origin self._ssl_context = ssl_context @@ -59,7 +61,7 @@ def __init__( self._network_backend: AsyncNetworkBackend = ( AutoBackend() if network_backend is None else network_backend ) - self._connection: Optional[AsyncConnectionInterface] = None + self._connection: AsyncConnectionInterface | None = None self._connect_failed: bool = False self._request_lock = AsyncLock() self._socket_options = socket_options @@ -208,13 +210,13 @@ def __repr__(self) -> str: # These context managers are not used in the standard flow, but are # useful for testing or working with connection instances directly. 
- async def __aenter__(self) -> "AsyncHTTPConnection": + async def __aenter__(self) -> AsyncHTTPConnection: return self async def __aexit__( self, - exc_type: Optional[Type[BaseException]] = None, - exc_value: Optional[BaseException] = None, - traceback: Optional[TracebackType] = None, + exc_type: type[BaseException] | None = None, + exc_value: BaseException | None = None, + traceback: types.TracebackType | None = None, ) -> None: await self.aclose() diff --git a/contrib/python/httpcore/httpcore/_async/connection_pool.py b/contrib/python/httpcore/httpcore/_async/connection_pool.py index 214dfc4be43f..96e973d0ce22 100644 --- a/contrib/python/httpcore/httpcore/_async/connection_pool.py +++ b/contrib/python/httpcore/httpcore/_async/connection_pool.py @@ -1,12 +1,14 @@ +from __future__ import annotations + import ssl import sys -from types import TracebackType -from typing import AsyncIterable, AsyncIterator, Iterable, List, Optional, Type +import types +import typing from .._backends.auto import AutoBackend from .._backends.base import SOCKET_OPTION, AsyncNetworkBackend from .._exceptions import ConnectionNotAvailable, UnsupportedProtocol -from .._models import Origin, Request, Response +from .._models import Origin, Proxy, Request, Response from .._synchronization import AsyncEvent, AsyncShieldCancellation, AsyncThreadLock from .connection import AsyncHTTPConnection from .interfaces import AsyncConnectionInterface, AsyncRequestInterface @@ -15,12 +17,10 @@ class AsyncPoolRequest: def __init__(self, request: Request) -> None: self.request = request - self.connection: Optional[AsyncConnectionInterface] = None + self.connection: AsyncConnectionInterface | None = None self._connection_acquired = AsyncEvent() - def assign_to_connection( - self, connection: Optional[AsyncConnectionInterface] - ) -> None: + def assign_to_connection(self, connection: AsyncConnectionInterface | None) -> None: self.connection = connection self._connection_acquired.set() @@ -29,7 +29,7 @@ def clear_connection(self) -> None: self._connection_acquired = AsyncEvent() async def wait_for_connection( - self, timeout: Optional[float] = None + self, timeout: float | None = None ) -> AsyncConnectionInterface: if self.connection is None: await self._connection_acquired.wait(timeout=timeout) @@ -47,17 +47,18 @@ class AsyncConnectionPool(AsyncRequestInterface): def __init__( self, - ssl_context: Optional[ssl.SSLContext] = None, - max_connections: Optional[int] = 10, - max_keepalive_connections: Optional[int] = None, - keepalive_expiry: Optional[float] = None, + ssl_context: ssl.SSLContext | None = None, + proxy: Proxy | None = None, + max_connections: int | None = 10, + max_keepalive_connections: int | None = None, + keepalive_expiry: float | None = None, http1: bool = True, http2: bool = False, retries: int = 0, - local_address: Optional[str] = None, - uds: Optional[str] = None, - network_backend: Optional[AsyncNetworkBackend] = None, - socket_options: Optional[Iterable[SOCKET_OPTION]] = None, + local_address: str | None = None, + uds: str | None = None, + network_backend: AsyncNetworkBackend | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, ) -> None: """ A connection pool for making HTTP requests. @@ -89,7 +90,7 @@ def __init__( in the TCP socket when the connection was established. 
""" self._ssl_context = ssl_context - + self._proxy = proxy self._max_connections = ( sys.maxsize if max_connections is None else max_connections ) @@ -116,8 +117,8 @@ def __init__( # The mutable state on a connection pool is the queue of incoming requests, # and the set of connections that are servicing those requests. - self._connections: List[AsyncConnectionInterface] = [] - self._requests: List[AsyncPoolRequest] = [] + self._connections: list[AsyncConnectionInterface] = [] + self._requests: list[AsyncPoolRequest] = [] # We only mutate the state of the connection pool within an 'optional_thread_lock' # context. This holds a threading lock unless we're running in async mode, @@ -125,6 +126,45 @@ def __init__( self._optional_thread_lock = AsyncThreadLock() def create_connection(self, origin: Origin) -> AsyncConnectionInterface: + if self._proxy is not None: + if self._proxy.url.scheme in (b"socks5", b"socks5h"): + from .socks_proxy import AsyncSocks5Connection + + return AsyncSocks5Connection( + proxy_origin=self._proxy.url.origin, + proxy_auth=self._proxy.auth, + remote_origin=origin, + ssl_context=self._ssl_context, + keepalive_expiry=self._keepalive_expiry, + http1=self._http1, + http2=self._http2, + network_backend=self._network_backend, + ) + elif origin.scheme == b"http": + from .http_proxy import AsyncForwardHTTPConnection + + return AsyncForwardHTTPConnection( + proxy_origin=self._proxy.url.origin, + proxy_headers=self._proxy.headers, + proxy_ssl_context=self._proxy.ssl_context, + remote_origin=origin, + keepalive_expiry=self._keepalive_expiry, + network_backend=self._network_backend, + ) + from .http_proxy import AsyncTunnelHTTPConnection + + return AsyncTunnelHTTPConnection( + proxy_origin=self._proxy.url.origin, + proxy_headers=self._proxy.headers, + proxy_ssl_context=self._proxy.ssl_context, + remote_origin=origin, + ssl_context=self._ssl_context, + keepalive_expiry=self._keepalive_expiry, + http1=self._http1, + http2=self._http2, + network_backend=self._network_backend, + ) + return AsyncHTTPConnection( origin=origin, ssl_context=self._ssl_context, @@ -139,7 +179,7 @@ def create_connection(self, origin: Origin) -> AsyncConnectionInterface: ) @property - def connections(self) -> List[AsyncConnectionInterface]: + def connections(self) -> list[AsyncConnectionInterface]: """ Return a list of the connections currently in the pool. @@ -217,7 +257,7 @@ async def handle_async_request(self, request: Request) -> Response: # Return the response. Note that in this case we still have to manage # the point at which the response is closed. - assert isinstance(response.stream, AsyncIterable) + assert isinstance(response.stream, typing.AsyncIterable) return Response( status=response.status, headers=response.headers, @@ -227,7 +267,7 @@ async def handle_async_request(self, request: Request) -> Response: extensions=response.extensions, ) - def _assign_requests_to_connections(self) -> List[AsyncConnectionInterface]: + def _assign_requests_to_connections(self) -> list[AsyncConnectionInterface]: """ Manage the state of the connection pool, assigning incoming requests to connections as available. @@ -298,7 +338,7 @@ def _assign_requests_to_connections(self) -> List[AsyncConnectionInterface]: return closing_connections - async def _close_connections(self, closing: List[AsyncConnectionInterface]) -> None: + async def _close_connections(self, closing: list[AsyncConnectionInterface]) -> None: # Close connections which have been removed from the pool. 
with AsyncShieldCancellation(): for connection in closing: @@ -312,14 +352,14 @@ async def aclose(self) -> None: self._connections = [] await self._close_connections(closing_connections) - async def __aenter__(self) -> "AsyncConnectionPool": + async def __aenter__(self) -> AsyncConnectionPool: return self async def __aexit__( self, - exc_type: Optional[Type[BaseException]] = None, - exc_value: Optional[BaseException] = None, - traceback: Optional[TracebackType] = None, + exc_type: type[BaseException] | None = None, + exc_value: BaseException | None = None, + traceback: types.TracebackType | None = None, ) -> None: await self.aclose() @@ -349,7 +389,7 @@ def __repr__(self) -> str: class PoolByteStream: def __init__( self, - stream: AsyncIterable[bytes], + stream: typing.AsyncIterable[bytes], pool_request: AsyncPoolRequest, pool: AsyncConnectionPool, ) -> None: @@ -358,7 +398,7 @@ def __init__( self._pool = pool self._closed = False - async def __aiter__(self) -> AsyncIterator[bytes]: + async def __aiter__(self) -> typing.AsyncIterator[bytes]: try: async for part in self._stream: yield part diff --git a/contrib/python/httpcore/httpcore/_async/http11.py b/contrib/python/httpcore/httpcore/_async/http11.py index 0493a923dc19..e6d6d709852b 100644 --- a/contrib/python/httpcore/httpcore/_async/http11.py +++ b/contrib/python/httpcore/httpcore/_async/http11.py @@ -1,18 +1,11 @@ +from __future__ import annotations + import enum import logging import ssl import time -from types import TracebackType -from typing import ( - Any, - AsyncIterable, - AsyncIterator, - List, - Optional, - Tuple, - Type, - Union, -) +import types +import typing import h11 @@ -33,7 +26,7 @@ # A subset of `h11.Event` types supported by `_send_event` -H11SendEvent = Union[ +H11SendEvent = typing.Union[ h11.Request, h11.Data, h11.EndOfMessage, @@ -55,12 +48,12 @@ def __init__( self, origin: Origin, stream: AsyncNetworkStream, - keepalive_expiry: Optional[float] = None, + keepalive_expiry: float | None = None, ) -> None: self._origin = origin self._network_stream = stream - self._keepalive_expiry: Optional[float] = keepalive_expiry - self._expire_at: Optional[float] = None + self._keepalive_expiry: float | None = keepalive_expiry + self._expire_at: float | None = None self._state = HTTPConnectionState.NEW self._state_lock = AsyncLock() self._request_count = 0 @@ -160,16 +153,14 @@ async def _send_request_body(self, request: Request) -> None: timeouts = request.extensions.get("timeout", {}) timeout = timeouts.get("write", None) - assert isinstance(request.stream, AsyncIterable) + assert isinstance(request.stream, typing.AsyncIterable) async for chunk in request.stream: event = h11.Data(data=chunk) await self._send_event(event, timeout=timeout) await self._send_event(h11.EndOfMessage(), timeout=timeout) - async def _send_event( - self, event: h11.Event, timeout: Optional[float] = None - ) -> None: + async def _send_event(self, event: h11.Event, timeout: float | None = None) -> None: bytes_to_send = self._h11_state.send(event) if bytes_to_send is not None: await self._network_stream.write(bytes_to_send, timeout=timeout) @@ -178,7 +169,7 @@ async def _send_event( async def _receive_response_headers( self, request: Request - ) -> Tuple[bytes, int, bytes, List[Tuple[bytes, bytes]], bytes]: + ) -> tuple[bytes, int, bytes, list[tuple[bytes, bytes]], bytes]: timeouts = request.extensions.get("timeout", {}) timeout = timeouts.get("read", None) @@ -202,7 +193,9 @@ async def _receive_response_headers( return http_version, event.status_code, 
event.reason, headers, trailing_data - async def _receive_response_body(self, request: Request) -> AsyncIterator[bytes]: + async def _receive_response_body( + self, request: Request + ) -> typing.AsyncIterator[bytes]: timeouts = request.extensions.get("timeout", {}) timeout = timeouts.get("read", None) @@ -214,8 +207,8 @@ async def _receive_response_body(self, request: Request) -> AsyncIterator[bytes] break async def _receive_event( - self, timeout: Optional[float] = None - ) -> Union[h11.Event, Type[h11.PAUSED]]: + self, timeout: float | None = None + ) -> h11.Event | type[h11.PAUSED]: while True: with map_exceptions({h11.RemoteProtocolError: RemoteProtocolError}): event = self._h11_state.next_event() @@ -316,14 +309,14 @@ def __repr__(self) -> str: # These context managers are not used in the standard flow, but are # useful for testing or working with connection instances directly. - async def __aenter__(self) -> "AsyncHTTP11Connection": + async def __aenter__(self) -> AsyncHTTP11Connection: return self async def __aexit__( self, - exc_type: Optional[Type[BaseException]] = None, - exc_value: Optional[BaseException] = None, - traceback: Optional[TracebackType] = None, + exc_type: type[BaseException] | None = None, + exc_value: BaseException | None = None, + traceback: types.TracebackType | None = None, ) -> None: await self.aclose() @@ -334,7 +327,7 @@ def __init__(self, connection: AsyncHTTP11Connection, request: Request) -> None: self._request = request self._closed = False - async def __aiter__(self) -> AsyncIterator[bytes]: + async def __aiter__(self) -> typing.AsyncIterator[bytes]: kwargs = {"request": self._request} try: async with Trace("receive_response_body", logger, self._request, kwargs): @@ -360,7 +353,7 @@ def __init__(self, stream: AsyncNetworkStream, leading_data: bytes) -> None: self._stream = stream self._leading_data = leading_data - async def read(self, max_bytes: int, timeout: Optional[float] = None) -> bytes: + async def read(self, max_bytes: int, timeout: float | None = None) -> bytes: if self._leading_data: buffer = self._leading_data[:max_bytes] self._leading_data = self._leading_data[max_bytes:] @@ -368,7 +361,7 @@ async def read(self, max_bytes: int, timeout: Optional[float] = None) -> bytes: else: return await self._stream.read(max_bytes, timeout) - async def write(self, buffer: bytes, timeout: Optional[float] = None) -> None: + async def write(self, buffer: bytes, timeout: float | None = None) -> None: await self._stream.write(buffer, timeout) async def aclose(self) -> None: @@ -377,10 +370,10 @@ async def aclose(self) -> None: async def start_tls( self, ssl_context: ssl.SSLContext, - server_hostname: Optional[str] = None, - timeout: Optional[float] = None, + server_hostname: str | None = None, + timeout: float | None = None, ) -> AsyncNetworkStream: return await self._stream.start_tls(ssl_context, server_hostname, timeout) - def get_extra_info(self, info: str) -> Any: + def get_extra_info(self, info: str) -> typing.Any: return self._stream.get_extra_info(info) diff --git a/contrib/python/httpcore/httpcore/_async/http2.py b/contrib/python/httpcore/httpcore/_async/http2.py index c201ee4cbcfb..c6434a049696 100644 --- a/contrib/python/httpcore/httpcore/_async/http2.py +++ b/contrib/python/httpcore/httpcore/_async/http2.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import enum import logging import time @@ -45,14 +47,14 @@ def __init__( self, origin: Origin, stream: AsyncNetworkStream, - keepalive_expiry: typing.Optional[float] = None, + 
keepalive_expiry: float | None = None, ): self._origin = origin self._network_stream = stream - self._keepalive_expiry: typing.Optional[float] = keepalive_expiry + self._keepalive_expiry: float | None = keepalive_expiry self._h2_state = h2.connection.H2Connection(config=self.CONFIG) self._state = HTTPConnectionState.IDLE - self._expire_at: typing.Optional[float] = None + self._expire_at: float | None = None self._request_count = 0 self._init_lock = AsyncLock() self._state_lock = AsyncLock() @@ -63,24 +65,20 @@ def __init__( self._connection_error = False # Mapping from stream ID to response stream events. - self._events: typing.Dict[ + self._events: dict[ int, - typing.Union[ - h2.events.ResponseReceived, - h2.events.DataReceived, - h2.events.StreamEnded, - h2.events.StreamReset, - ], + h2.events.ResponseReceived + | h2.events.DataReceived + | h2.events.StreamEnded + | h2.events.StreamReset, ] = {} # Connection terminated events are stored as state since # we need to handle them for all streams. - self._connection_terminated: typing.Optional[h2.events.ConnectionTerminated] = ( - None - ) + self._connection_terminated: h2.events.ConnectionTerminated | None = None - self._read_exception: typing.Optional[Exception] = None - self._write_exception: typing.Optional[Exception] = None + self._read_exception: Exception | None = None + self._write_exception: Exception | None = None async def handle_async_request(self, request: Request) -> Response: if not self.can_handle_request(request.url.origin): @@ -284,7 +282,7 @@ async def _send_end_stream(self, request: Request, stream_id: int) -> None: async def _receive_response( self, request: Request, stream_id: int - ) -> typing.Tuple[int, typing.List[typing.Tuple[bytes, bytes]]]: + ) -> tuple[int, list[tuple[bytes, bytes]]]: """ Return the response status code and headers for a given stream ID. """ @@ -321,9 +319,7 @@ async def _receive_response_body( async def _receive_stream_event( self, request: Request, stream_id: int - ) -> typing.Union[ - h2.events.ResponseReceived, h2.events.DataReceived, h2.events.StreamEnded - ]: + ) -> h2.events.ResponseReceived | h2.events.DataReceived | h2.events.StreamEnded: """ Return the next available event for a given stream ID. @@ -337,7 +333,7 @@ async def _receive_stream_event( return event async def _receive_events( - self, request: Request, stream_id: typing.Optional[int] = None + self, request: Request, stream_id: int | None = None ) -> None: """ Read some data from the network until we see one or more events @@ -425,9 +421,7 @@ async def aclose(self) -> None: # Wrappers around network read/write operations... - async def _read_incoming_data( - self, request: Request - ) -> typing.List[h2.events.Event]: + async def _read_incoming_data(self, request: Request) -> list[h2.events.Event]: timeouts = request.extensions.get("timeout", {}) timeout = timeouts.get("read", None) @@ -451,7 +445,7 @@ async def _read_incoming_data( self._connection_error = True raise exc - events: typing.List[h2.events.Event] = self._h2_state.receive_data(data) + events: list[h2.events.Event] = self._h2_state.receive_data(data) return events @@ -544,14 +538,14 @@ def __repr__(self) -> str: # These context managers are not used in the standard flow, but are # useful for testing or working with connection instances directly. 
- async def __aenter__(self) -> "AsyncHTTP2Connection": + async def __aenter__(self) -> AsyncHTTP2Connection: return self async def __aexit__( self, - exc_type: typing.Optional[typing.Type[BaseException]] = None, - exc_value: typing.Optional[BaseException] = None, - traceback: typing.Optional[types.TracebackType] = None, + exc_type: type[BaseException] | None = None, + exc_value: BaseException | None = None, + traceback: types.TracebackType | None = None, ) -> None: await self.aclose() diff --git a/contrib/python/httpcore/httpcore/_async/http_proxy.py b/contrib/python/httpcore/httpcore/_async/http_proxy.py index 4aa7d8741a95..cc9d92066e16 100644 --- a/contrib/python/httpcore/httpcore/_async/http_proxy.py +++ b/contrib/python/httpcore/httpcore/_async/http_proxy.py @@ -1,7 +1,9 @@ +from __future__ import annotations + +import base64 import logging import ssl -from base64 import b64encode -from typing import Iterable, List, Mapping, Optional, Sequence, Tuple, Union +import typing from .._backends.base import SOCKET_OPTION, AsyncNetworkBackend from .._exceptions import ProxyError @@ -22,17 +24,18 @@ from .http11 import AsyncHTTP11Connection from .interfaces import AsyncConnectionInterface -HeadersAsSequence = Sequence[Tuple[Union[bytes, str], Union[bytes, str]]] -HeadersAsMapping = Mapping[Union[bytes, str], Union[bytes, str]] +ByteOrStr = typing.Union[bytes, str] +HeadersAsSequence = typing.Sequence[typing.Tuple[ByteOrStr, ByteOrStr]] +HeadersAsMapping = typing.Mapping[ByteOrStr, ByteOrStr] logger = logging.getLogger("httpcore.proxy") def merge_headers( - default_headers: Optional[Sequence[Tuple[bytes, bytes]]] = None, - override_headers: Optional[Sequence[Tuple[bytes, bytes]]] = None, -) -> List[Tuple[bytes, bytes]]: + default_headers: typing.Sequence[tuple[bytes, bytes]] | None = None, + override_headers: typing.Sequence[tuple[bytes, bytes]] | None = None, +) -> list[tuple[bytes, bytes]]: """ Append default_headers and override_headers, de-duplicating if a key exists in both cases. @@ -48,33 +51,28 @@ def merge_headers( return default_headers + override_headers -def build_auth_header(username: bytes, password: bytes) -> bytes: - userpass = username + b":" + password - return b"Basic " + b64encode(userpass) - - -class AsyncHTTPProxy(AsyncConnectionPool): +class AsyncHTTPProxy(AsyncConnectionPool): # pragma: nocover """ A connection pool that sends requests via an HTTP proxy. 
""" def __init__( self, - proxy_url: Union[URL, bytes, str], - proxy_auth: Optional[Tuple[Union[bytes, str], Union[bytes, str]]] = None, - proxy_headers: Union[HeadersAsMapping, HeadersAsSequence, None] = None, - ssl_context: Optional[ssl.SSLContext] = None, - proxy_ssl_context: Optional[ssl.SSLContext] = None, - max_connections: Optional[int] = 10, - max_keepalive_connections: Optional[int] = None, - keepalive_expiry: Optional[float] = None, + proxy_url: URL | bytes | str, + proxy_auth: tuple[bytes | str, bytes | str] | None = None, + proxy_headers: HeadersAsMapping | HeadersAsSequence | None = None, + ssl_context: ssl.SSLContext | None = None, + proxy_ssl_context: ssl.SSLContext | None = None, + max_connections: int | None = 10, + max_keepalive_connections: int | None = None, + keepalive_expiry: float | None = None, http1: bool = True, http2: bool = False, retries: int = 0, - local_address: Optional[str] = None, - uds: Optional[str] = None, - network_backend: Optional[AsyncNetworkBackend] = None, - socket_options: Optional[Iterable[SOCKET_OPTION]] = None, + local_address: str | None = None, + uds: str | None = None, + network_backend: AsyncNetworkBackend | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, ) -> None: """ A connection pool for making HTTP requests. @@ -139,7 +137,8 @@ def __init__( if proxy_auth is not None: username = enforce_bytes(proxy_auth[0], name="proxy_auth") password = enforce_bytes(proxy_auth[1], name="proxy_auth") - authorization = build_auth_header(username, password) + userpass = username + b":" + password + authorization = b"Basic " + base64.b64encode(userpass) self._proxy_headers = [ (b"Proxy-Authorization", authorization) ] + self._proxy_headers @@ -172,11 +171,11 @@ def __init__( self, proxy_origin: Origin, remote_origin: Origin, - proxy_headers: Union[HeadersAsMapping, HeadersAsSequence, None] = None, - keepalive_expiry: Optional[float] = None, - network_backend: Optional[AsyncNetworkBackend] = None, - socket_options: Optional[Iterable[SOCKET_OPTION]] = None, - proxy_ssl_context: Optional[ssl.SSLContext] = None, + proxy_headers: HeadersAsMapping | HeadersAsSequence | None = None, + keepalive_expiry: float | None = None, + network_backend: AsyncNetworkBackend | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, + proxy_ssl_context: ssl.SSLContext | None = None, ) -> None: self._connection = AsyncHTTPConnection( origin=proxy_origin, @@ -236,14 +235,14 @@ def __init__( self, proxy_origin: Origin, remote_origin: Origin, - ssl_context: Optional[ssl.SSLContext] = None, - proxy_ssl_context: Optional[ssl.SSLContext] = None, - proxy_headers: Optional[Sequence[Tuple[bytes, bytes]]] = None, - keepalive_expiry: Optional[float] = None, + ssl_context: ssl.SSLContext | None = None, + proxy_ssl_context: ssl.SSLContext | None = None, + proxy_headers: typing.Sequence[tuple[bytes, bytes]] | None = None, + keepalive_expiry: float | None = None, http1: bool = True, http2: bool = False, - network_backend: Optional[AsyncNetworkBackend] = None, - socket_options: Optional[Iterable[SOCKET_OPTION]] = None, + network_backend: AsyncNetworkBackend | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, ) -> None: self._connection: AsyncConnectionInterface = AsyncHTTPConnection( origin=proxy_origin, diff --git a/contrib/python/httpcore/httpcore/_async/interfaces.py b/contrib/python/httpcore/httpcore/_async/interfaces.py index c998dd276326..361583bede6b 100644 --- 
a/contrib/python/httpcore/httpcore/_async/interfaces.py +++ b/contrib/python/httpcore/httpcore/_async/interfaces.py @@ -1,5 +1,7 @@ -from contextlib import asynccontextmanager -from typing import AsyncIterator, Optional, Union +from __future__ import annotations + +import contextlib +import typing from .._models import ( URL, @@ -18,12 +20,12 @@ class AsyncRequestInterface: async def request( self, - method: Union[bytes, str], - url: Union[URL, bytes, str], + method: bytes | str, + url: URL | bytes | str, *, headers: HeaderTypes = None, - content: Union[bytes, AsyncIterator[bytes], None] = None, - extensions: Optional[Extensions] = None, + content: bytes | typing.AsyncIterator[bytes] | None = None, + extensions: Extensions | None = None, ) -> Response: # Strict type checking on our parameters. method = enforce_bytes(method, name="method") @@ -47,16 +49,16 @@ async def request( await response.aclose() return response - @asynccontextmanager + @contextlib.asynccontextmanager async def stream( self, - method: Union[bytes, str], - url: Union[URL, bytes, str], + method: bytes | str, + url: URL | bytes | str, *, headers: HeaderTypes = None, - content: Union[bytes, AsyncIterator[bytes], None] = None, - extensions: Optional[Extensions] = None, - ) -> AsyncIterator[Response]: + content: bytes | typing.AsyncIterator[bytes] | None = None, + extensions: Extensions | None = None, + ) -> typing.AsyncIterator[Response]: # Strict type checking on our parameters. method = enforce_bytes(method, name="method") url = enforce_url(url, name="url") diff --git a/contrib/python/httpcore/httpcore/_async/socks_proxy.py b/contrib/python/httpcore/httpcore/_async/socks_proxy.py index f839603fe50c..b363f55a0b07 100644 --- a/contrib/python/httpcore/httpcore/_async/socks_proxy.py +++ b/contrib/python/httpcore/httpcore/_async/socks_proxy.py @@ -1,8 +1,9 @@ +from __future__ import annotations + import logging import ssl -import typing -from socksio import socks5 +import socksio from .._backends.auto import AutoBackend from .._backends.base import AsyncNetworkBackend, AsyncNetworkStream @@ -43,24 +44,24 @@ async def _init_socks5_connection( *, host: bytes, port: int, - auth: typing.Optional[typing.Tuple[bytes, bytes]] = None, + auth: tuple[bytes, bytes] | None = None, ) -> None: - conn = socks5.SOCKS5Connection() + conn = socksio.socks5.SOCKS5Connection() # Auth method request auth_method = ( - socks5.SOCKS5AuthMethod.NO_AUTH_REQUIRED + socksio.socks5.SOCKS5AuthMethod.NO_AUTH_REQUIRED if auth is None - else socks5.SOCKS5AuthMethod.USERNAME_PASSWORD + else socksio.socks5.SOCKS5AuthMethod.USERNAME_PASSWORD ) - conn.send(socks5.SOCKS5AuthMethodsRequest([auth_method])) + conn.send(socksio.socks5.SOCKS5AuthMethodsRequest([auth_method])) outgoing_bytes = conn.data_to_send() await stream.write(outgoing_bytes) # Auth method response incoming_bytes = await stream.read(max_bytes=4096) response = conn.receive_data(incoming_bytes) - assert isinstance(response, socks5.SOCKS5AuthReply) + assert isinstance(response, socksio.socks5.SOCKS5AuthReply) if response.method != auth_method: requested = AUTH_METHODS.get(auth_method, "UNKNOWN") responded = AUTH_METHODS.get(response.method, "UNKNOWN") @@ -68,25 +69,25 @@ async def _init_socks5_connection( f"Requested {requested} from proxy server, but got {responded}." 
) - if response.method == socks5.SOCKS5AuthMethod.USERNAME_PASSWORD: + if response.method == socksio.socks5.SOCKS5AuthMethod.USERNAME_PASSWORD: # Username/password request assert auth is not None username, password = auth - conn.send(socks5.SOCKS5UsernamePasswordRequest(username, password)) + conn.send(socksio.socks5.SOCKS5UsernamePasswordRequest(username, password)) outgoing_bytes = conn.data_to_send() await stream.write(outgoing_bytes) # Username/password response incoming_bytes = await stream.read(max_bytes=4096) response = conn.receive_data(incoming_bytes) - assert isinstance(response, socks5.SOCKS5UsernamePasswordReply) + assert isinstance(response, socksio.socks5.SOCKS5UsernamePasswordReply) if not response.success: raise ProxyError("Invalid username/password") # Connect request conn.send( - socks5.SOCKS5CommandRequest.from_address( - socks5.SOCKS5Command.CONNECT, (host, port) + socksio.socks5.SOCKS5CommandRequest.from_address( + socksio.socks5.SOCKS5Command.CONNECT, (host, port) ) ) outgoing_bytes = conn.data_to_send() @@ -95,31 +96,29 @@ async def _init_socks5_connection( # Connect response incoming_bytes = await stream.read(max_bytes=4096) response = conn.receive_data(incoming_bytes) - assert isinstance(response, socks5.SOCKS5Reply) - if response.reply_code != socks5.SOCKS5ReplyCode.SUCCEEDED: + assert isinstance(response, socksio.socks5.SOCKS5Reply) + if response.reply_code != socksio.socks5.SOCKS5ReplyCode.SUCCEEDED: reply_code = REPLY_CODES.get(response.reply_code, "UNKOWN") raise ProxyError(f"Proxy Server could not connect: {reply_code}.") -class AsyncSOCKSProxy(AsyncConnectionPool): +class AsyncSOCKSProxy(AsyncConnectionPool): # pragma: nocover """ A connection pool that sends requests via an HTTP proxy. """ def __init__( self, - proxy_url: typing.Union[URL, bytes, str], - proxy_auth: typing.Optional[ - typing.Tuple[typing.Union[bytes, str], typing.Union[bytes, str]] - ] = None, - ssl_context: typing.Optional[ssl.SSLContext] = None, - max_connections: typing.Optional[int] = 10, - max_keepalive_connections: typing.Optional[int] = None, - keepalive_expiry: typing.Optional[float] = None, + proxy_url: URL | bytes | str, + proxy_auth: tuple[bytes | str, bytes | str] | None = None, + ssl_context: ssl.SSLContext | None = None, + max_connections: int | None = 10, + max_keepalive_connections: int | None = None, + keepalive_expiry: float | None = None, http1: bool = True, http2: bool = False, retries: int = 0, - network_backend: typing.Optional[AsyncNetworkBackend] = None, + network_backend: AsyncNetworkBackend | None = None, ) -> None: """ A connection pool for making HTTP requests. 
@@ -167,7 +166,7 @@ def __init__( username, password = proxy_auth username_bytes = enforce_bytes(username, name="proxy_auth") password_bytes = enforce_bytes(password, name="proxy_auth") - self._proxy_auth: typing.Optional[typing.Tuple[bytes, bytes]] = ( + self._proxy_auth: tuple[bytes, bytes] | None = ( username_bytes, password_bytes, ) @@ -192,12 +191,12 @@ def __init__( self, proxy_origin: Origin, remote_origin: Origin, - proxy_auth: typing.Optional[typing.Tuple[bytes, bytes]] = None, - ssl_context: typing.Optional[ssl.SSLContext] = None, - keepalive_expiry: typing.Optional[float] = None, + proxy_auth: tuple[bytes, bytes] | None = None, + ssl_context: ssl.SSLContext | None = None, + keepalive_expiry: float | None = None, http1: bool = True, http2: bool = False, - network_backend: typing.Optional[AsyncNetworkBackend] = None, + network_backend: AsyncNetworkBackend | None = None, ) -> None: self._proxy_origin = proxy_origin self._remote_origin = remote_origin @@ -211,7 +210,7 @@ def __init__( AutoBackend() if network_backend is None else network_backend ) self._connect_lock = AsyncLock() - self._connection: typing.Optional[AsyncConnectionInterface] = None + self._connection: AsyncConnectionInterface | None = None self._connect_failed = False async def handle_async_request(self, request: Request) -> Response: diff --git a/contrib/python/httpcore/httpcore/_backends/anyio.py b/contrib/python/httpcore/httpcore/_backends/anyio.py index d469e0084cf0..a140095e1b8d 100644 --- a/contrib/python/httpcore/httpcore/_backends/anyio.py +++ b/contrib/python/httpcore/httpcore/_backends/anyio.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import ssl import typing @@ -20,9 +22,7 @@ class AnyIOStream(AsyncNetworkStream): def __init__(self, stream: anyio.abc.ByteStream) -> None: self._stream = stream - async def read( - self, max_bytes: int, timeout: typing.Optional[float] = None - ) -> bytes: + async def read(self, max_bytes: int, timeout: float | None = None) -> bytes: exc_map = { TimeoutError: ReadTimeout, anyio.BrokenResourceError: ReadError, @@ -36,9 +36,7 @@ async def read( except anyio.EndOfStream: # pragma: nocover return b"" - async def write( - self, buffer: bytes, timeout: typing.Optional[float] = None - ) -> None: + async def write(self, buffer: bytes, timeout: float | None = None) -> None: if not buffer: return @@ -57,8 +55,8 @@ async def aclose(self) -> None: async def start_tls( self, ssl_context: ssl.SSLContext, - server_hostname: typing.Optional[str] = None, - timeout: typing.Optional[float] = None, + server_hostname: str | None = None, + timeout: float | None = None, ) -> AsyncNetworkStream: exc_map = { TimeoutError: ConnectTimeout, @@ -101,9 +99,9 @@ async def connect_tcp( self, host: str, port: int, - timeout: typing.Optional[float] = None, - local_address: typing.Optional[str] = None, - socket_options: typing.Optional[typing.Iterable[SOCKET_OPTION]] = None, + timeout: float | None = None, + local_address: str | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, ) -> AsyncNetworkStream: # pragma: nocover if socket_options is None: socket_options = [] @@ -127,8 +125,8 @@ async def connect_tcp( async def connect_unix_socket( self, path: str, - timeout: typing.Optional[float] = None, - socket_options: typing.Optional[typing.Iterable[SOCKET_OPTION]] = None, + timeout: float | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, ) -> AsyncNetworkStream: # pragma: nocover if socket_options is None: socket_options = [] diff --git 
a/contrib/python/httpcore/httpcore/_backends/auto.py b/contrib/python/httpcore/httpcore/_backends/auto.py index 3ac05f4da0b1..49f0e698c97a 100644 --- a/contrib/python/httpcore/httpcore/_backends/auto.py +++ b/contrib/python/httpcore/httpcore/_backends/auto.py @@ -1,5 +1,6 @@ +from __future__ import annotations + import typing -from typing import Optional from .._synchronization import current_async_library from .base import SOCKET_OPTION, AsyncNetworkBackend, AsyncNetworkStream @@ -22,9 +23,9 @@ async def connect_tcp( self, host: str, port: int, - timeout: Optional[float] = None, - local_address: Optional[str] = None, - socket_options: typing.Optional[typing.Iterable[SOCKET_OPTION]] = None, + timeout: float | None = None, + local_address: str | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, ) -> AsyncNetworkStream: await self._init_backend() return await self._backend.connect_tcp( @@ -38,8 +39,8 @@ async def connect_tcp( async def connect_unix_socket( self, path: str, - timeout: Optional[float] = None, - socket_options: typing.Optional[typing.Iterable[SOCKET_OPTION]] = None, + timeout: float | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, ) -> AsyncNetworkStream: # pragma: nocover await self._init_backend() return await self._backend.connect_unix_socket( diff --git a/contrib/python/httpcore/httpcore/_backends/base.py b/contrib/python/httpcore/httpcore/_backends/base.py index 6cadedb5f936..cf55c8b10eb5 100644 --- a/contrib/python/httpcore/httpcore/_backends/base.py +++ b/contrib/python/httpcore/httpcore/_backends/base.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import ssl import time import typing @@ -10,10 +12,10 @@ class NetworkStream: - def read(self, max_bytes: int, timeout: typing.Optional[float] = None) -> bytes: + def read(self, max_bytes: int, timeout: float | None = None) -> bytes: raise NotImplementedError() # pragma: nocover - def write(self, buffer: bytes, timeout: typing.Optional[float] = None) -> None: + def write(self, buffer: bytes, timeout: float | None = None) -> None: raise NotImplementedError() # pragma: nocover def close(self) -> None: @@ -22,9 +24,9 @@ def close(self) -> None: def start_tls( self, ssl_context: ssl.SSLContext, - server_hostname: typing.Optional[str] = None, - timeout: typing.Optional[float] = None, - ) -> "NetworkStream": + server_hostname: str | None = None, + timeout: float | None = None, + ) -> NetworkStream: raise NotImplementedError() # pragma: nocover def get_extra_info(self, info: str) -> typing.Any: @@ -36,17 +38,17 @@ def connect_tcp( self, host: str, port: int, - timeout: typing.Optional[float] = None, - local_address: typing.Optional[str] = None, - socket_options: typing.Optional[typing.Iterable[SOCKET_OPTION]] = None, + timeout: float | None = None, + local_address: str | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, ) -> NetworkStream: raise NotImplementedError() # pragma: nocover def connect_unix_socket( self, path: str, - timeout: typing.Optional[float] = None, - socket_options: typing.Optional[typing.Iterable[SOCKET_OPTION]] = None, + timeout: float | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, ) -> NetworkStream: raise NotImplementedError() # pragma: nocover @@ -55,14 +57,10 @@ def sleep(self, seconds: float) -> None: class AsyncNetworkStream: - async def read( - self, max_bytes: int, timeout: typing.Optional[float] = None - ) -> bytes: + async def read(self, max_bytes: int, timeout: float | None = 
None) -> bytes: raise NotImplementedError() # pragma: nocover - async def write( - self, buffer: bytes, timeout: typing.Optional[float] = None - ) -> None: + async def write(self, buffer: bytes, timeout: float | None = None) -> None: raise NotImplementedError() # pragma: nocover async def aclose(self) -> None: @@ -71,9 +69,9 @@ async def aclose(self) -> None: async def start_tls( self, ssl_context: ssl.SSLContext, - server_hostname: typing.Optional[str] = None, - timeout: typing.Optional[float] = None, - ) -> "AsyncNetworkStream": + server_hostname: str | None = None, + timeout: float | None = None, + ) -> AsyncNetworkStream: raise NotImplementedError() # pragma: nocover def get_extra_info(self, info: str) -> typing.Any: @@ -85,17 +83,17 @@ async def connect_tcp( self, host: str, port: int, - timeout: typing.Optional[float] = None, - local_address: typing.Optional[str] = None, - socket_options: typing.Optional[typing.Iterable[SOCKET_OPTION]] = None, + timeout: float | None = None, + local_address: str | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, ) -> AsyncNetworkStream: raise NotImplementedError() # pragma: nocover async def connect_unix_socket( self, path: str, - timeout: typing.Optional[float] = None, - socket_options: typing.Optional[typing.Iterable[SOCKET_OPTION]] = None, + timeout: float | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, ) -> AsyncNetworkStream: raise NotImplementedError() # pragma: nocover diff --git a/contrib/python/httpcore/httpcore/_backends/mock.py b/contrib/python/httpcore/httpcore/_backends/mock.py index f7aefebf5194..9b6edca03d4d 100644 --- a/contrib/python/httpcore/httpcore/_backends/mock.py +++ b/contrib/python/httpcore/httpcore/_backends/mock.py @@ -1,6 +1,7 @@ +from __future__ import annotations + import ssl import typing -from typing import Optional from .._exceptions import ReadError from .base import ( @@ -21,19 +22,19 @@ def selected_alpn_protocol(self) -> str: class MockStream(NetworkStream): - def __init__(self, buffer: typing.List[bytes], http2: bool = False) -> None: + def __init__(self, buffer: list[bytes], http2: bool = False) -> None: self._buffer = buffer self._http2 = http2 self._closed = False - def read(self, max_bytes: int, timeout: Optional[float] = None) -> bytes: + def read(self, max_bytes: int, timeout: float | None = None) -> bytes: if self._closed: raise ReadError("Connection closed") if not self._buffer: return b"" return self._buffer.pop(0) - def write(self, buffer: bytes, timeout: Optional[float] = None) -> None: + def write(self, buffer: bytes, timeout: float | None = None) -> None: pass def close(self) -> None: @@ -42,8 +43,8 @@ def close(self) -> None: def start_tls( self, ssl_context: ssl.SSLContext, - server_hostname: Optional[str] = None, - timeout: Optional[float] = None, + server_hostname: str | None = None, + timeout: float | None = None, ) -> NetworkStream: return self @@ -55,7 +56,7 @@ def __repr__(self) -> str: class MockBackend(NetworkBackend): - def __init__(self, buffer: typing.List[bytes], http2: bool = False) -> None: + def __init__(self, buffer: list[bytes], http2: bool = False) -> None: self._buffer = buffer self._http2 = http2 @@ -63,17 +64,17 @@ def connect_tcp( self, host: str, port: int, - timeout: Optional[float] = None, - local_address: Optional[str] = None, - socket_options: typing.Optional[typing.Iterable[SOCKET_OPTION]] = None, + timeout: float | None = None, + local_address: str | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] 
| None = None, ) -> NetworkStream: return MockStream(list(self._buffer), http2=self._http2) def connect_unix_socket( self, path: str, - timeout: Optional[float] = None, - socket_options: typing.Optional[typing.Iterable[SOCKET_OPTION]] = None, + timeout: float | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, ) -> NetworkStream: return MockStream(list(self._buffer), http2=self._http2) @@ -82,19 +83,19 @@ def sleep(self, seconds: float) -> None: class AsyncMockStream(AsyncNetworkStream): - def __init__(self, buffer: typing.List[bytes], http2: bool = False) -> None: + def __init__(self, buffer: list[bytes], http2: bool = False) -> None: self._buffer = buffer self._http2 = http2 self._closed = False - async def read(self, max_bytes: int, timeout: Optional[float] = None) -> bytes: + async def read(self, max_bytes: int, timeout: float | None = None) -> bytes: if self._closed: raise ReadError("Connection closed") if not self._buffer: return b"" return self._buffer.pop(0) - async def write(self, buffer: bytes, timeout: Optional[float] = None) -> None: + async def write(self, buffer: bytes, timeout: float | None = None) -> None: pass async def aclose(self) -> None: @@ -103,8 +104,8 @@ async def aclose(self) -> None: async def start_tls( self, ssl_context: ssl.SSLContext, - server_hostname: Optional[str] = None, - timeout: Optional[float] = None, + server_hostname: str | None = None, + timeout: float | None = None, ) -> AsyncNetworkStream: return self @@ -116,7 +117,7 @@ def __repr__(self) -> str: class AsyncMockBackend(AsyncNetworkBackend): - def __init__(self, buffer: typing.List[bytes], http2: bool = False) -> None: + def __init__(self, buffer: list[bytes], http2: bool = False) -> None: self._buffer = buffer self._http2 = http2 @@ -124,17 +125,17 @@ async def connect_tcp( self, host: str, port: int, - timeout: Optional[float] = None, - local_address: Optional[str] = None, - socket_options: typing.Optional[typing.Iterable[SOCKET_OPTION]] = None, + timeout: float | None = None, + local_address: str | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, ) -> AsyncNetworkStream: return AsyncMockStream(list(self._buffer), http2=self._http2) async def connect_unix_socket( self, path: str, - timeout: Optional[float] = None, - socket_options: typing.Optional[typing.Iterable[SOCKET_OPTION]] = None, + timeout: float | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, ) -> AsyncNetworkStream: return AsyncMockStream(list(self._buffer), http2=self._http2) diff --git a/contrib/python/httpcore/httpcore/_backends/sync.py b/contrib/python/httpcore/httpcore/_backends/sync.py index 7b7b417dc198..4018a09c6fb1 100644 --- a/contrib/python/httpcore/httpcore/_backends/sync.py +++ b/contrib/python/httpcore/httpcore/_backends/sync.py @@ -1,8 +1,10 @@ +from __future__ import annotations + +import functools import socket import ssl import sys import typing -from functools import partial from .._exceptions import ( ConnectError, @@ -33,8 +35,8 @@ def __init__( self, sock: socket.socket, ssl_context: ssl.SSLContext, - server_hostname: typing.Optional[str] = None, - timeout: typing.Optional[float] = None, + server_hostname: str | None = None, + timeout: float | None = None, ): self._sock = sock self._incoming = ssl.MemoryBIO() @@ -74,20 +76,20 @@ def _perform_io( if errno is None: return ret - def read(self, max_bytes: int, timeout: typing.Optional[float] = None) -> bytes: + def read(self, max_bytes: int, timeout: float | None = None) -> bytes: 
exc_map: ExceptionMapping = {socket.timeout: ReadTimeout, OSError: ReadError} with map_exceptions(exc_map): self._sock.settimeout(timeout) return typing.cast( - bytes, self._perform_io(partial(self.ssl_obj.read, max_bytes)) + bytes, self._perform_io(functools.partial(self.ssl_obj.read, max_bytes)) ) - def write(self, buffer: bytes, timeout: typing.Optional[float] = None) -> None: + def write(self, buffer: bytes, timeout: float | None = None) -> None: exc_map: ExceptionMapping = {socket.timeout: WriteTimeout, OSError: WriteError} with map_exceptions(exc_map): self._sock.settimeout(timeout) while buffer: - nsent = self._perform_io(partial(self.ssl_obj.write, buffer)) + nsent = self._perform_io(functools.partial(self.ssl_obj.write, buffer)) buffer = buffer[nsent:] def close(self) -> None: @@ -96,9 +98,9 @@ def close(self) -> None: def start_tls( self, ssl_context: ssl.SSLContext, - server_hostname: typing.Optional[str] = None, - timeout: typing.Optional[float] = None, - ) -> "NetworkStream": + server_hostname: str | None = None, + timeout: float | None = None, + ) -> NetworkStream: raise NotImplementedError() def get_extra_info(self, info: str) -> typing.Any: @@ -119,13 +121,13 @@ class SyncStream(NetworkStream): def __init__(self, sock: socket.socket) -> None: self._sock = sock - def read(self, max_bytes: int, timeout: typing.Optional[float] = None) -> bytes: + def read(self, max_bytes: int, timeout: float | None = None) -> bytes: exc_map: ExceptionMapping = {socket.timeout: ReadTimeout, OSError: ReadError} with map_exceptions(exc_map): self._sock.settimeout(timeout) return self._sock.recv(max_bytes) - def write(self, buffer: bytes, timeout: typing.Optional[float] = None) -> None: + def write(self, buffer: bytes, timeout: float | None = None) -> None: if not buffer: return @@ -142,8 +144,8 @@ def close(self) -> None: def start_tls( self, ssl_context: ssl.SSLContext, - server_hostname: typing.Optional[str] = None, - timeout: typing.Optional[float] = None, + server_hostname: str | None = None, + timeout: float | None = None, ) -> NetworkStream: exc_map: ExceptionMapping = { socket.timeout: ConnectTimeout, @@ -187,9 +189,9 @@ def connect_tcp( self, host: str, port: int, - timeout: typing.Optional[float] = None, - local_address: typing.Optional[str] = None, - socket_options: typing.Optional[typing.Iterable[SOCKET_OPTION]] = None, + timeout: float | None = None, + local_address: str | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, ) -> NetworkStream: # Note that we automatically include `TCP_NODELAY` # in addition to any other custom socket options. 
@@ -216,8 +218,8 @@ def connect_tcp( def connect_unix_socket( self, path: str, - timeout: typing.Optional[float] = None, - socket_options: typing.Optional[typing.Iterable[SOCKET_OPTION]] = None, + timeout: float | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, ) -> NetworkStream: # pragma: nocover if sys.platform == "win32": raise RuntimeError( diff --git a/contrib/python/httpcore/httpcore/_backends/trio.py b/contrib/python/httpcore/httpcore/_backends/trio.py index b1626d28e2de..6f53f5f2a025 100644 --- a/contrib/python/httpcore/httpcore/_backends/trio.py +++ b/contrib/python/httpcore/httpcore/_backends/trio.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import ssl import typing @@ -20,9 +22,7 @@ class TrioStream(AsyncNetworkStream): def __init__(self, stream: trio.abc.Stream) -> None: self._stream = stream - async def read( - self, max_bytes: int, timeout: typing.Optional[float] = None - ) -> bytes: + async def read(self, max_bytes: int, timeout: float | None = None) -> bytes: timeout_or_inf = float("inf") if timeout is None else timeout exc_map: ExceptionMapping = { trio.TooSlowError: ReadTimeout, @@ -34,9 +34,7 @@ async def read( data: bytes = await self._stream.receive_some(max_bytes=max_bytes) return data - async def write( - self, buffer: bytes, timeout: typing.Optional[float] = None - ) -> None: + async def write(self, buffer: bytes, timeout: float | None = None) -> None: if not buffer: return @@ -56,8 +54,8 @@ async def aclose(self) -> None: async def start_tls( self, ssl_context: ssl.SSLContext, - server_hostname: typing.Optional[str] = None, - timeout: typing.Optional[float] = None, + server_hostname: str | None = None, + timeout: float | None = None, ) -> AsyncNetworkStream: timeout_or_inf = float("inf") if timeout is None else timeout exc_map: ExceptionMapping = { @@ -113,9 +111,9 @@ async def connect_tcp( self, host: str, port: int, - timeout: typing.Optional[float] = None, - local_address: typing.Optional[str] = None, - socket_options: typing.Optional[typing.Iterable[SOCKET_OPTION]] = None, + timeout: float | None = None, + local_address: str | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, ) -> AsyncNetworkStream: # By default for TCP sockets, trio enables TCP_NODELAY. 
# https://trio.readthedocs.io/en/stable/reference-io.html#trio.SocketStream @@ -139,8 +137,8 @@ async def connect_tcp( async def connect_unix_socket( self, path: str, - timeout: typing.Optional[float] = None, - socket_options: typing.Optional[typing.Iterable[SOCKET_OPTION]] = None, + timeout: float | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, ) -> AsyncNetworkStream: # pragma: nocover if socket_options is None: socket_options = [] diff --git a/contrib/python/httpcore/httpcore/_exceptions.py b/contrib/python/httpcore/httpcore/_exceptions.py index 81e7fc61ddfe..bc28d44f55bd 100644 --- a/contrib/python/httpcore/httpcore/_exceptions.py +++ b/contrib/python/httpcore/httpcore/_exceptions.py @@ -1,11 +1,11 @@ import contextlib -from typing import Iterator, Mapping, Type +import typing -ExceptionMapping = Mapping[Type[Exception], Type[Exception]] +ExceptionMapping = typing.Mapping[typing.Type[Exception], typing.Type[Exception]] @contextlib.contextmanager -def map_exceptions(map: ExceptionMapping) -> Iterator[None]: +def map_exceptions(map: ExceptionMapping) -> typing.Iterator[None]: try: yield except Exception as exc: # noqa: PIE786 diff --git a/contrib/python/httpcore/httpcore/_models.py b/contrib/python/httpcore/httpcore/_models.py index dadee79f695e..8a65f13347d6 100644 --- a/contrib/python/httpcore/httpcore/_models.py +++ b/contrib/python/httpcore/httpcore/_models.py @@ -1,30 +1,22 @@ -from typing import ( - Any, - AsyncIterable, - AsyncIterator, - Iterable, - Iterator, - List, - Mapping, - MutableMapping, - Optional, - Sequence, - Tuple, - Union, -) -from urllib.parse import urlparse +from __future__ import annotations + +import base64 +import ssl +import typing +import urllib.parse # Functions for typechecking... -HeadersAsSequence = Sequence[Tuple[Union[bytes, str], Union[bytes, str]]] -HeadersAsMapping = Mapping[Union[bytes, str], Union[bytes, str]] -HeaderTypes = Union[HeadersAsSequence, HeadersAsMapping, None] +ByteOrStr = typing.Union[bytes, str] +HeadersAsSequence = typing.Sequence[typing.Tuple[ByteOrStr, ByteOrStr]] +HeadersAsMapping = typing.Mapping[ByteOrStr, ByteOrStr] +HeaderTypes = typing.Union[HeadersAsSequence, HeadersAsMapping, None] -Extensions = MutableMapping[str, Any] +Extensions = typing.MutableMapping[str, typing.Any] -def enforce_bytes(value: Union[bytes, str], *, name: str) -> bytes: +def enforce_bytes(value: bytes | str, *, name: str) -> bytes: """ Any arguments that are ultimately represented as bytes can be specified either as bytes or as strings. @@ -45,7 +37,7 @@ def enforce_bytes(value: Union[bytes, str], *, name: str) -> bytes: raise TypeError(f"{name} must be bytes or str, but got {seen_type}.") -def enforce_url(value: Union["URL", bytes, str], *, name: str) -> "URL": +def enforce_url(value: URL | bytes | str, *, name: str) -> URL: """ Type check for URL parameters. """ @@ -59,15 +51,15 @@ def enforce_url(value: Union["URL", bytes, str], *, name: str) -> "URL": def enforce_headers( - value: Union[HeadersAsMapping, HeadersAsSequence, None] = None, *, name: str -) -> List[Tuple[bytes, bytes]]: + value: HeadersAsMapping | HeadersAsSequence | None = None, *, name: str +) -> list[tuple[bytes, bytes]]: """ Convienence function that ensure all items in request or response headers are either bytes or strings in the plain ASCII range. 
""" if value is None: return [] - elif isinstance(value, Mapping): + elif isinstance(value, typing.Mapping): return [ ( enforce_bytes(k, name="header name"), @@ -75,7 +67,7 @@ def enforce_headers( ) for k, v in value.items() ] - elif isinstance(value, Sequence): + elif isinstance(value, typing.Sequence): return [ ( enforce_bytes(k, name="header name"), @@ -91,8 +83,10 @@ def enforce_headers( def enforce_stream( - value: Union[bytes, Iterable[bytes], AsyncIterable[bytes], None], *, name: str -) -> Union[Iterable[bytes], AsyncIterable[bytes]]: + value: bytes | typing.Iterable[bytes] | typing.AsyncIterable[bytes] | None, + *, + name: str, +) -> typing.Iterable[bytes] | typing.AsyncIterable[bytes]: if value is None: return ByteStream(b"") elif isinstance(value, bytes): @@ -113,11 +107,11 @@ def enforce_stream( def include_request_headers( - headers: List[Tuple[bytes, bytes]], + headers: list[tuple[bytes, bytes]], *, url: "URL", - content: Union[None, bytes, Iterable[bytes], AsyncIterable[bytes]], -) -> List[Tuple[bytes, bytes]]: + content: None | bytes | typing.Iterable[bytes] | typing.AsyncIterable[bytes], +) -> list[tuple[bytes, bytes]]: headers_set = set(k.lower() for k, v in headers) if b"host" not in headers_set: @@ -154,10 +148,10 @@ class ByteStream: def __init__(self, content: bytes) -> None: self._content = content - def __iter__(self) -> Iterator[bytes]: + def __iter__(self) -> typing.Iterator[bytes]: yield self._content - async def __aiter__(self) -> AsyncIterator[bytes]: + async def __aiter__(self) -> typing.AsyncIterator[bytes]: yield self._content def __repr__(self) -> str: @@ -170,7 +164,7 @@ def __init__(self, scheme: bytes, host: bytes, port: int) -> None: self.host = host self.port = port - def __eq__(self, other: Any) -> bool: + def __eq__(self, other: typing.Any) -> bool: return ( isinstance(other, Origin) and self.scheme == other.scheme @@ -254,12 +248,12 @@ class URL: def __init__( self, - url: Union[bytes, str] = "", + url: bytes | str = "", *, - scheme: Union[bytes, str] = b"", - host: Union[bytes, str] = b"", - port: Optional[int] = None, - target: Union[bytes, str] = b"", + scheme: bytes | str = b"", + host: bytes | str = b"", + port: int | None = None, + target: bytes | str = b"", ) -> None: """ Parameters: @@ -271,7 +265,7 @@ def __init__( target: The target of the HTTP request. Such as `"/items?search=red"`. 
""" if url: - parsed = urlparse(enforce_bytes(url, name="url")) + parsed = urllib.parse.urlparse(enforce_bytes(url, name="url")) self.scheme = parsed.scheme self.host = parsed.hostname or b"" self.port = parsed.port @@ -292,12 +286,13 @@ def origin(self) -> Origin: b"ws": 80, b"wss": 443, b"socks5": 1080, + b"socks5h": 1080, }[self.scheme] return Origin( scheme=self.scheme, host=self.host, port=self.port or default_port ) - def __eq__(self, other: Any) -> bool: + def __eq__(self, other: typing.Any) -> bool: return ( isinstance(other, URL) and other.scheme == self.scheme @@ -325,12 +320,15 @@ class Request: def __init__( self, - method: Union[bytes, str], - url: Union[URL, bytes, str], + method: bytes | str, + url: URL | bytes | str, *, headers: HeaderTypes = None, - content: Union[bytes, Iterable[bytes], AsyncIterable[bytes], None] = None, - extensions: Optional[Extensions] = None, + content: bytes + | typing.Iterable[bytes] + | typing.AsyncIterable[bytes] + | None = None, + extensions: Extensions | None = None, ) -> None: """ Parameters: @@ -345,11 +343,11 @@ def __init__( """ self.method: bytes = enforce_bytes(method, name="method") self.url: URL = enforce_url(url, name="url") - self.headers: List[Tuple[bytes, bytes]] = enforce_headers( + self.headers: list[tuple[bytes, bytes]] = enforce_headers( headers, name="headers" ) - self.stream: Union[Iterable[bytes], AsyncIterable[bytes]] = enforce_stream( - content, name="content" + self.stream: typing.Iterable[bytes] | typing.AsyncIterable[bytes] = ( + enforce_stream(content, name="content") ) self.extensions = {} if extensions is None else extensions @@ -375,8 +373,11 @@ def __init__( status: int, *, headers: HeaderTypes = None, - content: Union[bytes, Iterable[bytes], AsyncIterable[bytes], None] = None, - extensions: Optional[Extensions] = None, + content: bytes + | typing.Iterable[bytes] + | typing.AsyncIterable[bytes] + | None = None, + extensions: Extensions | None = None, ) -> None: """ Parameters: @@ -388,11 +389,11 @@ def __init__( `"reason_phrase"`, and `"network_stream"`. """ self.status: int = status - self.headers: List[Tuple[bytes, bytes]] = enforce_headers( + self.headers: list[tuple[bytes, bytes]] = enforce_headers( headers, name="headers" ) - self.stream: Union[Iterable[bytes], AsyncIterable[bytes]] = enforce_stream( - content, name="content" + self.stream: typing.Iterable[bytes] | typing.AsyncIterable[bytes] = ( + enforce_stream(content, name="content") ) self.extensions = {} if extensions is None else extensions @@ -401,7 +402,7 @@ def __init__( @property def content(self) -> bytes: if not hasattr(self, "_content"): - if isinstance(self.stream, Iterable): + if isinstance(self.stream, typing.Iterable): raise RuntimeError( "Attempted to access 'response.content' on a streaming response. " "Call 'response.read()' first." @@ -419,7 +420,7 @@ def __repr__(self) -> str: # Sync interface... def read(self) -> bytes: - if not isinstance(self.stream, Iterable): # pragma: nocover + if not isinstance(self.stream, typing.Iterable): # pragma: nocover raise RuntimeError( "Attempted to read an asynchronous response using 'response.read()'. " "You should use 'await response.aread()' instead." 
@@ -428,8 +429,8 @@ def read(self) -> bytes: self._content = b"".join([part for part in self.iter_stream()]) return self._content - def iter_stream(self) -> Iterator[bytes]: - if not isinstance(self.stream, Iterable): # pragma: nocover + def iter_stream(self) -> typing.Iterator[bytes]: + if not isinstance(self.stream, typing.Iterable): # pragma: nocover raise RuntimeError( "Attempted to stream an asynchronous response using 'for ... in " "response.iter_stream()'. " @@ -444,7 +445,7 @@ def iter_stream(self) -> Iterator[bytes]: yield chunk def close(self) -> None: - if not isinstance(self.stream, Iterable): # pragma: nocover + if not isinstance(self.stream, typing.Iterable): # pragma: nocover raise RuntimeError( "Attempted to close an asynchronous response using 'response.close()'. " "You should use 'await response.aclose()' instead." @@ -455,7 +456,7 @@ def close(self) -> None: # Async interface... async def aread(self) -> bytes: - if not isinstance(self.stream, AsyncIterable): # pragma: nocover + if not isinstance(self.stream, typing.AsyncIterable): # pragma: nocover raise RuntimeError( "Attempted to read an synchronous response using " "'await response.aread()'. " @@ -465,8 +466,8 @@ async def aread(self) -> bytes: self._content = b"".join([part async for part in self.aiter_stream()]) return self._content - async def aiter_stream(self) -> AsyncIterator[bytes]: - if not isinstance(self.stream, AsyncIterable): # pragma: nocover + async def aiter_stream(self) -> typing.AsyncIterator[bytes]: + if not isinstance(self.stream, typing.AsyncIterable): # pragma: nocover raise RuntimeError( "Attempted to stream an synchronous response using 'async for ... in " "response.aiter_stream()'. " @@ -482,7 +483,7 @@ async def aiter_stream(self) -> AsyncIterator[bytes]: yield chunk async def aclose(self) -> None: - if not isinstance(self.stream, AsyncIterable): # pragma: nocover + if not isinstance(self.stream, typing.AsyncIterable): # pragma: nocover raise RuntimeError( "Attempted to close a synchronous response using " "'await response.aclose()'. 
" @@ -490,3 +491,26 @@ async def aclose(self) -> None: ) if hasattr(self.stream, "aclose"): await self.stream.aclose() + + +class Proxy: + def __init__( + self, + url: URL | bytes | str, + auth: tuple[bytes | str, bytes | str] | None = None, + headers: HeadersAsMapping | HeadersAsSequence | None = None, + ssl_context: ssl.SSLContext | None = None, + ): + self.url = enforce_url(url, name="url") + self.headers = enforce_headers(headers, name="headers") + self.ssl_context = ssl_context + + if auth is not None: + username = enforce_bytes(auth[0], name="auth") + password = enforce_bytes(auth[1], name="auth") + userpass = username + b":" + password + authorization = b"Basic " + base64.b64encode(userpass) + self.auth: tuple[bytes, bytes] | None = (username, password) + self.headers = [(b"Proxy-Authorization", authorization)] + self.headers + else: + self.auth = None diff --git a/contrib/python/httpcore/httpcore/_sync/connection.py b/contrib/python/httpcore/httpcore/_sync/connection.py index c3890f340c2f..363f8be819d2 100644 --- a/contrib/python/httpcore/httpcore/_sync/connection.py +++ b/contrib/python/httpcore/httpcore/_sync/connection.py @@ -1,8 +1,10 @@ +from __future__ import annotations + import itertools import logging import ssl -from types import TracebackType -from typing import Iterable, Iterator, Optional, Type +import types +import typing from .._backends.sync import SyncBackend from .._backends.base import SOCKET_OPTION, NetworkBackend, NetworkStream @@ -20,7 +22,7 @@ logger = logging.getLogger("httpcore.connection") -def exponential_backoff(factor: float) -> Iterator[float]: +def exponential_backoff(factor: float) -> typing.Iterator[float]: """ Generate a geometric sequence that has a ratio of 2 and starts with 0. @@ -37,15 +39,15 @@ class HTTPConnection(ConnectionInterface): def __init__( self, origin: Origin, - ssl_context: Optional[ssl.SSLContext] = None, - keepalive_expiry: Optional[float] = None, + ssl_context: ssl.SSLContext | None = None, + keepalive_expiry: float | None = None, http1: bool = True, http2: bool = False, retries: int = 0, - local_address: Optional[str] = None, - uds: Optional[str] = None, - network_backend: Optional[NetworkBackend] = None, - socket_options: Optional[Iterable[SOCKET_OPTION]] = None, + local_address: str | None = None, + uds: str | None = None, + network_backend: NetworkBackend | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, ) -> None: self._origin = origin self._ssl_context = ssl_context @@ -59,7 +61,7 @@ def __init__( self._network_backend: NetworkBackend = ( SyncBackend() if network_backend is None else network_backend ) - self._connection: Optional[ConnectionInterface] = None + self._connection: ConnectionInterface | None = None self._connect_failed: bool = False self._request_lock = Lock() self._socket_options = socket_options @@ -208,13 +210,13 @@ def __repr__(self) -> str: # These context managers are not used in the standard flow, but are # useful for testing or working with connection instances directly. 
- def __enter__(self) -> "HTTPConnection": + def __enter__(self) -> HTTPConnection: return self def __exit__( self, - exc_type: Optional[Type[BaseException]] = None, - exc_value: Optional[BaseException] = None, - traceback: Optional[TracebackType] = None, + exc_type: type[BaseException] | None = None, + exc_value: BaseException | None = None, + traceback: types.TracebackType | None = None, ) -> None: self.close() diff --git a/contrib/python/httpcore/httpcore/_sync/connection_pool.py b/contrib/python/httpcore/httpcore/_sync/connection_pool.py index 01bec59e8813..9ccfa53e597a 100644 --- a/contrib/python/httpcore/httpcore/_sync/connection_pool.py +++ b/contrib/python/httpcore/httpcore/_sync/connection_pool.py @@ -1,12 +1,14 @@ +from __future__ import annotations + import ssl import sys -from types import TracebackType -from typing import Iterable, Iterator, Iterable, List, Optional, Type +import types +import typing from .._backends.sync import SyncBackend from .._backends.base import SOCKET_OPTION, NetworkBackend from .._exceptions import ConnectionNotAvailable, UnsupportedProtocol -from .._models import Origin, Request, Response +from .._models import Origin, Proxy, Request, Response from .._synchronization import Event, ShieldCancellation, ThreadLock from .connection import HTTPConnection from .interfaces import ConnectionInterface, RequestInterface @@ -15,12 +17,10 @@ class PoolRequest: def __init__(self, request: Request) -> None: self.request = request - self.connection: Optional[ConnectionInterface] = None + self.connection: ConnectionInterface | None = None self._connection_acquired = Event() - def assign_to_connection( - self, connection: Optional[ConnectionInterface] - ) -> None: + def assign_to_connection(self, connection: ConnectionInterface | None) -> None: self.connection = connection self._connection_acquired.set() @@ -29,7 +29,7 @@ def clear_connection(self) -> None: self._connection_acquired = Event() def wait_for_connection( - self, timeout: Optional[float] = None + self, timeout: float | None = None ) -> ConnectionInterface: if self.connection is None: self._connection_acquired.wait(timeout=timeout) @@ -47,17 +47,18 @@ class ConnectionPool(RequestInterface): def __init__( self, - ssl_context: Optional[ssl.SSLContext] = None, - max_connections: Optional[int] = 10, - max_keepalive_connections: Optional[int] = None, - keepalive_expiry: Optional[float] = None, + ssl_context: ssl.SSLContext | None = None, + proxy: Proxy | None = None, + max_connections: int | None = 10, + max_keepalive_connections: int | None = None, + keepalive_expiry: float | None = None, http1: bool = True, http2: bool = False, retries: int = 0, - local_address: Optional[str] = None, - uds: Optional[str] = None, - network_backend: Optional[NetworkBackend] = None, - socket_options: Optional[Iterable[SOCKET_OPTION]] = None, + local_address: str | None = None, + uds: str | None = None, + network_backend: NetworkBackend | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, ) -> None: """ A connection pool for making HTTP requests. @@ -89,7 +90,7 @@ def __init__( in the TCP socket when the connection was established. """ self._ssl_context = ssl_context - + self._proxy = proxy self._max_connections = ( sys.maxsize if max_connections is None else max_connections ) @@ -116,8 +117,8 @@ def __init__( # The mutable state on a connection pool is the queue of incoming requests, # and the set of connections that are servicing those requests. 
- self._connections: List[ConnectionInterface] = [] - self._requests: List[PoolRequest] = [] + self._connections: list[ConnectionInterface] = [] + self._requests: list[PoolRequest] = [] # We only mutate the state of the connection pool within an 'optional_thread_lock' # context. This holds a threading lock unless we're running in async mode, @@ -125,6 +126,45 @@ def __init__( self._optional_thread_lock = ThreadLock() def create_connection(self, origin: Origin) -> ConnectionInterface: + if self._proxy is not None: + if self._proxy.url.scheme in (b"socks5", b"socks5h"): + from .socks_proxy import Socks5Connection + + return Socks5Connection( + proxy_origin=self._proxy.url.origin, + proxy_auth=self._proxy.auth, + remote_origin=origin, + ssl_context=self._ssl_context, + keepalive_expiry=self._keepalive_expiry, + http1=self._http1, + http2=self._http2, + network_backend=self._network_backend, + ) + elif origin.scheme == b"http": + from .http_proxy import ForwardHTTPConnection + + return ForwardHTTPConnection( + proxy_origin=self._proxy.url.origin, + proxy_headers=self._proxy.headers, + proxy_ssl_context=self._proxy.ssl_context, + remote_origin=origin, + keepalive_expiry=self._keepalive_expiry, + network_backend=self._network_backend, + ) + from .http_proxy import TunnelHTTPConnection + + return TunnelHTTPConnection( + proxy_origin=self._proxy.url.origin, + proxy_headers=self._proxy.headers, + proxy_ssl_context=self._proxy.ssl_context, + remote_origin=origin, + ssl_context=self._ssl_context, + keepalive_expiry=self._keepalive_expiry, + http1=self._http1, + http2=self._http2, + network_backend=self._network_backend, + ) + return HTTPConnection( origin=origin, ssl_context=self._ssl_context, @@ -139,7 +179,7 @@ def create_connection(self, origin: Origin) -> ConnectionInterface: ) @property - def connections(self) -> List[ConnectionInterface]: + def connections(self) -> list[ConnectionInterface]: """ Return a list of the connections currently in the pool. @@ -217,7 +257,7 @@ def handle_request(self, request: Request) -> Response: # Return the response. Note that in this case we still have to manage # the point at which the response is closed. - assert isinstance(response.stream, Iterable) + assert isinstance(response.stream, typing.Iterable) return Response( status=response.status, headers=response.headers, @@ -227,7 +267,7 @@ def handle_request(self, request: Request) -> Response: extensions=response.extensions, ) - def _assign_requests_to_connections(self) -> List[ConnectionInterface]: + def _assign_requests_to_connections(self) -> list[ConnectionInterface]: """ Manage the state of the connection pool, assigning incoming requests to connections as available. @@ -298,7 +338,7 @@ def _assign_requests_to_connections(self) -> List[ConnectionInterface]: return closing_connections - def _close_connections(self, closing: List[ConnectionInterface]) -> None: + def _close_connections(self, closing: list[ConnectionInterface]) -> None: # Close connections which have been removed from the pool. 
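With the new `proxy` argument wired into the pool, `create_connection` above dispatches on the proxy scheme and the target origin: `socks5`/`socks5h` proxies go to `Socks5Connection`, plain `http` origins are forwarded through the proxy, and everything else is tunnelled via CONNECT. A rough construction-only sketch of that routing (no network I/O happens until a request is issued; the proxy address is a placeholder):

.. code:: python

    from httpcore._models import Origin, Proxy
    from httpcore._sync.connection_pool import ConnectionPool
    from httpcore._sync.http_proxy import ForwardHTTPConnection, TunnelHTTPConnection

    # Placeholder proxy; connections are only constructed here, never opened.
    pool = ConnectionPool(proxy=Proxy("http://localhost:8080"))

    plain = pool.create_connection(Origin(b"http", b"example.com", 80))
    tls = pool.create_connection(Origin(b"https", b"example.com", 443))

    assert isinstance(plain, ForwardHTTPConnection)
    assert isinstance(tls, TunnelHTTPConnection)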
with ShieldCancellation(): for connection in closing: @@ -312,14 +352,14 @@ def close(self) -> None: self._connections = [] self._close_connections(closing_connections) - def __enter__(self) -> "ConnectionPool": + def __enter__(self) -> ConnectionPool: return self def __exit__( self, - exc_type: Optional[Type[BaseException]] = None, - exc_value: Optional[BaseException] = None, - traceback: Optional[TracebackType] = None, + exc_type: type[BaseException] | None = None, + exc_value: BaseException | None = None, + traceback: types.TracebackType | None = None, ) -> None: self.close() @@ -349,7 +389,7 @@ def __repr__(self) -> str: class PoolByteStream: def __init__( self, - stream: Iterable[bytes], + stream: typing.Iterable[bytes], pool_request: PoolRequest, pool: ConnectionPool, ) -> None: @@ -358,7 +398,7 @@ def __init__( self._pool = pool self._closed = False - def __iter__(self) -> Iterator[bytes]: + def __iter__(self) -> typing.Iterator[bytes]: try: for part in self._stream: yield part diff --git a/contrib/python/httpcore/httpcore/_sync/http11.py b/contrib/python/httpcore/httpcore/_sync/http11.py index a74ff8e8092f..ebd3a97480c7 100644 --- a/contrib/python/httpcore/httpcore/_sync/http11.py +++ b/contrib/python/httpcore/httpcore/_sync/http11.py @@ -1,18 +1,11 @@ +from __future__ import annotations + import enum import logging import ssl import time -from types import TracebackType -from typing import ( - Any, - Iterable, - Iterator, - List, - Optional, - Tuple, - Type, - Union, -) +import types +import typing import h11 @@ -33,7 +26,7 @@ # A subset of `h11.Event` types supported by `_send_event` -H11SendEvent = Union[ +H11SendEvent = typing.Union[ h11.Request, h11.Data, h11.EndOfMessage, @@ -55,12 +48,12 @@ def __init__( self, origin: Origin, stream: NetworkStream, - keepalive_expiry: Optional[float] = None, + keepalive_expiry: float | None = None, ) -> None: self._origin = origin self._network_stream = stream - self._keepalive_expiry: Optional[float] = keepalive_expiry - self._expire_at: Optional[float] = None + self._keepalive_expiry: float | None = keepalive_expiry + self._expire_at: float | None = None self._state = HTTPConnectionState.NEW self._state_lock = Lock() self._request_count = 0 @@ -160,16 +153,14 @@ def _send_request_body(self, request: Request) -> None: timeouts = request.extensions.get("timeout", {}) timeout = timeouts.get("write", None) - assert isinstance(request.stream, Iterable) + assert isinstance(request.stream, typing.Iterable) for chunk in request.stream: event = h11.Data(data=chunk) self._send_event(event, timeout=timeout) self._send_event(h11.EndOfMessage(), timeout=timeout) - def _send_event( - self, event: h11.Event, timeout: Optional[float] = None - ) -> None: + def _send_event(self, event: h11.Event, timeout: float | None = None) -> None: bytes_to_send = self._h11_state.send(event) if bytes_to_send is not None: self._network_stream.write(bytes_to_send, timeout=timeout) @@ -178,7 +169,7 @@ def _send_event( def _receive_response_headers( self, request: Request - ) -> Tuple[bytes, int, bytes, List[Tuple[bytes, bytes]], bytes]: + ) -> tuple[bytes, int, bytes, list[tuple[bytes, bytes]], bytes]: timeouts = request.extensions.get("timeout", {}) timeout = timeouts.get("read", None) @@ -202,7 +193,9 @@ def _receive_response_headers( return http_version, event.status_code, event.reason, headers, trailing_data - def _receive_response_body(self, request: Request) -> Iterator[bytes]: + def _receive_response_body( + self, request: Request + ) -> typing.Iterator[bytes]: 
timeouts = request.extensions.get("timeout", {}) timeout = timeouts.get("read", None) @@ -214,8 +207,8 @@ def _receive_response_body(self, request: Request) -> Iterator[bytes]: break def _receive_event( - self, timeout: Optional[float] = None - ) -> Union[h11.Event, Type[h11.PAUSED]]: + self, timeout: float | None = None + ) -> h11.Event | type[h11.PAUSED]: while True: with map_exceptions({h11.RemoteProtocolError: RemoteProtocolError}): event = self._h11_state.next_event() @@ -316,14 +309,14 @@ def __repr__(self) -> str: # These context managers are not used in the standard flow, but are # useful for testing or working with connection instances directly. - def __enter__(self) -> "HTTP11Connection": + def __enter__(self) -> HTTP11Connection: return self def __exit__( self, - exc_type: Optional[Type[BaseException]] = None, - exc_value: Optional[BaseException] = None, - traceback: Optional[TracebackType] = None, + exc_type: type[BaseException] | None = None, + exc_value: BaseException | None = None, + traceback: types.TracebackType | None = None, ) -> None: self.close() @@ -334,7 +327,7 @@ def __init__(self, connection: HTTP11Connection, request: Request) -> None: self._request = request self._closed = False - def __iter__(self) -> Iterator[bytes]: + def __iter__(self) -> typing.Iterator[bytes]: kwargs = {"request": self._request} try: with Trace("receive_response_body", logger, self._request, kwargs): @@ -360,7 +353,7 @@ def __init__(self, stream: NetworkStream, leading_data: bytes) -> None: self._stream = stream self._leading_data = leading_data - def read(self, max_bytes: int, timeout: Optional[float] = None) -> bytes: + def read(self, max_bytes: int, timeout: float | None = None) -> bytes: if self._leading_data: buffer = self._leading_data[:max_bytes] self._leading_data = self._leading_data[max_bytes:] @@ -368,7 +361,7 @@ def read(self, max_bytes: int, timeout: Optional[float] = None) -> bytes: else: return self._stream.read(max_bytes, timeout) - def write(self, buffer: bytes, timeout: Optional[float] = None) -> None: + def write(self, buffer: bytes, timeout: float | None = None) -> None: self._stream.write(buffer, timeout) def close(self) -> None: @@ -377,10 +370,10 @@ def close(self) -> None: def start_tls( self, ssl_context: ssl.SSLContext, - server_hostname: Optional[str] = None, - timeout: Optional[float] = None, + server_hostname: str | None = None, + timeout: float | None = None, ) -> NetworkStream: return self._stream.start_tls(ssl_context, server_hostname, timeout) - def get_extra_info(self, info: str) -> Any: + def get_extra_info(self, info: str) -> typing.Any: return self._stream.get_extra_info(info) diff --git a/contrib/python/httpcore/httpcore/_sync/http2.py b/contrib/python/httpcore/httpcore/_sync/http2.py index 1ee4bbb34fe6..ca4dd724325c 100644 --- a/contrib/python/httpcore/httpcore/_sync/http2.py +++ b/contrib/python/httpcore/httpcore/_sync/http2.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import enum import logging import time @@ -45,14 +47,14 @@ def __init__( self, origin: Origin, stream: NetworkStream, - keepalive_expiry: typing.Optional[float] = None, + keepalive_expiry: float | None = None, ): self._origin = origin self._network_stream = stream - self._keepalive_expiry: typing.Optional[float] = keepalive_expiry + self._keepalive_expiry: float | None = keepalive_expiry self._h2_state = h2.connection.H2Connection(config=self.CONFIG) self._state = HTTPConnectionState.IDLE - self._expire_at: typing.Optional[float] = None + self._expire_at: float | None = None 
self._request_count = 0 self._init_lock = Lock() self._state_lock = Lock() @@ -63,24 +65,20 @@ def __init__( self._connection_error = False # Mapping from stream ID to response stream events. - self._events: typing.Dict[ + self._events: dict[ int, - typing.Union[ - h2.events.ResponseReceived, - h2.events.DataReceived, - h2.events.StreamEnded, - h2.events.StreamReset, - ], + h2.events.ResponseReceived + | h2.events.DataReceived + | h2.events.StreamEnded + | h2.events.StreamReset, ] = {} # Connection terminated events are stored as state since # we need to handle them for all streams. - self._connection_terminated: typing.Optional[h2.events.ConnectionTerminated] = ( - None - ) + self._connection_terminated: h2.events.ConnectionTerminated | None = None - self._read_exception: typing.Optional[Exception] = None - self._write_exception: typing.Optional[Exception] = None + self._read_exception: Exception | None = None + self._write_exception: Exception | None = None def handle_request(self, request: Request) -> Response: if not self.can_handle_request(request.url.origin): @@ -284,7 +282,7 @@ def _send_end_stream(self, request: Request, stream_id: int) -> None: def _receive_response( self, request: Request, stream_id: int - ) -> typing.Tuple[int, typing.List[typing.Tuple[bytes, bytes]]]: + ) -> tuple[int, list[tuple[bytes, bytes]]]: """ Return the response status code and headers for a given stream ID. """ @@ -321,9 +319,7 @@ def _receive_response_body( def _receive_stream_event( self, request: Request, stream_id: int - ) -> typing.Union[ - h2.events.ResponseReceived, h2.events.DataReceived, h2.events.StreamEnded - ]: + ) -> h2.events.ResponseReceived | h2.events.DataReceived | h2.events.StreamEnded: """ Return the next available event for a given stream ID. @@ -337,7 +333,7 @@ def _receive_stream_event( return event def _receive_events( - self, request: Request, stream_id: typing.Optional[int] = None + self, request: Request, stream_id: int | None = None ) -> None: """ Read some data from the network until we see one or more events @@ -425,9 +421,7 @@ def close(self) -> None: # Wrappers around network read/write operations... - def _read_incoming_data( - self, request: Request - ) -> typing.List[h2.events.Event]: + def _read_incoming_data(self, request: Request) -> list[h2.events.Event]: timeouts = request.extensions.get("timeout", {}) timeout = timeouts.get("read", None) @@ -451,7 +445,7 @@ def _read_incoming_data( self._connection_error = True raise exc - events: typing.List[h2.events.Event] = self._h2_state.receive_data(data) + events: list[h2.events.Event] = self._h2_state.receive_data(data) return events @@ -544,14 +538,14 @@ def __repr__(self) -> str: # These context managers are not used in the standard flow, but are # useful for testing or working with connection instances directly. 
- def __enter__(self) -> "HTTP2Connection": + def __enter__(self) -> HTTP2Connection: return self def __exit__( self, - exc_type: typing.Optional[typing.Type[BaseException]] = None, - exc_value: typing.Optional[BaseException] = None, - traceback: typing.Optional[types.TracebackType] = None, + exc_type: type[BaseException] | None = None, + exc_value: BaseException | None = None, + traceback: types.TracebackType | None = None, ) -> None: self.close() diff --git a/contrib/python/httpcore/httpcore/_sync/http_proxy.py b/contrib/python/httpcore/httpcore/_sync/http_proxy.py index 6acac9a7cd52..ecca88f7dc93 100644 --- a/contrib/python/httpcore/httpcore/_sync/http_proxy.py +++ b/contrib/python/httpcore/httpcore/_sync/http_proxy.py @@ -1,7 +1,9 @@ +from __future__ import annotations + +import base64 import logging import ssl -from base64 import b64encode -from typing import Iterable, List, Mapping, Optional, Sequence, Tuple, Union +import typing from .._backends.base import SOCKET_OPTION, NetworkBackend from .._exceptions import ProxyError @@ -22,17 +24,18 @@ from .http11 import HTTP11Connection from .interfaces import ConnectionInterface -HeadersAsSequence = Sequence[Tuple[Union[bytes, str], Union[bytes, str]]] -HeadersAsMapping = Mapping[Union[bytes, str], Union[bytes, str]] +ByteOrStr = typing.Union[bytes, str] +HeadersAsSequence = typing.Sequence[typing.Tuple[ByteOrStr, ByteOrStr]] +HeadersAsMapping = typing.Mapping[ByteOrStr, ByteOrStr] logger = logging.getLogger("httpcore.proxy") def merge_headers( - default_headers: Optional[Sequence[Tuple[bytes, bytes]]] = None, - override_headers: Optional[Sequence[Tuple[bytes, bytes]]] = None, -) -> List[Tuple[bytes, bytes]]: + default_headers: typing.Sequence[tuple[bytes, bytes]] | None = None, + override_headers: typing.Sequence[tuple[bytes, bytes]] | None = None, +) -> list[tuple[bytes, bytes]]: """ Append default_headers and override_headers, de-duplicating if a key exists in both cases. @@ -48,33 +51,28 @@ def merge_headers( return default_headers + override_headers -def build_auth_header(username: bytes, password: bytes) -> bytes: - userpass = username + b":" + password - return b"Basic " + b64encode(userpass) - - -class HTTPProxy(ConnectionPool): +class HTTPProxy(ConnectionPool): # pragma: nocover """ A connection pool that sends requests via an HTTP proxy. 
""" def __init__( self, - proxy_url: Union[URL, bytes, str], - proxy_auth: Optional[Tuple[Union[bytes, str], Union[bytes, str]]] = None, - proxy_headers: Union[HeadersAsMapping, HeadersAsSequence, None] = None, - ssl_context: Optional[ssl.SSLContext] = None, - proxy_ssl_context: Optional[ssl.SSLContext] = None, - max_connections: Optional[int] = 10, - max_keepalive_connections: Optional[int] = None, - keepalive_expiry: Optional[float] = None, + proxy_url: URL | bytes | str, + proxy_auth: tuple[bytes | str, bytes | str] | None = None, + proxy_headers: HeadersAsMapping | HeadersAsSequence | None = None, + ssl_context: ssl.SSLContext | None = None, + proxy_ssl_context: ssl.SSLContext | None = None, + max_connections: int | None = 10, + max_keepalive_connections: int | None = None, + keepalive_expiry: float | None = None, http1: bool = True, http2: bool = False, retries: int = 0, - local_address: Optional[str] = None, - uds: Optional[str] = None, - network_backend: Optional[NetworkBackend] = None, - socket_options: Optional[Iterable[SOCKET_OPTION]] = None, + local_address: str | None = None, + uds: str | None = None, + network_backend: NetworkBackend | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, ) -> None: """ A connection pool for making HTTP requests. @@ -139,7 +137,8 @@ def __init__( if proxy_auth is not None: username = enforce_bytes(proxy_auth[0], name="proxy_auth") password = enforce_bytes(proxy_auth[1], name="proxy_auth") - authorization = build_auth_header(username, password) + userpass = username + b":" + password + authorization = b"Basic " + base64.b64encode(userpass) self._proxy_headers = [ (b"Proxy-Authorization", authorization) ] + self._proxy_headers @@ -172,11 +171,11 @@ def __init__( self, proxy_origin: Origin, remote_origin: Origin, - proxy_headers: Union[HeadersAsMapping, HeadersAsSequence, None] = None, - keepalive_expiry: Optional[float] = None, - network_backend: Optional[NetworkBackend] = None, - socket_options: Optional[Iterable[SOCKET_OPTION]] = None, - proxy_ssl_context: Optional[ssl.SSLContext] = None, + proxy_headers: HeadersAsMapping | HeadersAsSequence | None = None, + keepalive_expiry: float | None = None, + network_backend: NetworkBackend | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, + proxy_ssl_context: ssl.SSLContext | None = None, ) -> None: self._connection = HTTPConnection( origin=proxy_origin, @@ -236,14 +235,14 @@ def __init__( self, proxy_origin: Origin, remote_origin: Origin, - ssl_context: Optional[ssl.SSLContext] = None, - proxy_ssl_context: Optional[ssl.SSLContext] = None, - proxy_headers: Optional[Sequence[Tuple[bytes, bytes]]] = None, - keepalive_expiry: Optional[float] = None, + ssl_context: ssl.SSLContext | None = None, + proxy_ssl_context: ssl.SSLContext | None = None, + proxy_headers: typing.Sequence[tuple[bytes, bytes]] | None = None, + keepalive_expiry: float | None = None, http1: bool = True, http2: bool = False, - network_backend: Optional[NetworkBackend] = None, - socket_options: Optional[Iterable[SOCKET_OPTION]] = None, + network_backend: NetworkBackend | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, ) -> None: self._connection: ConnectionInterface = HTTPConnection( origin=proxy_origin, diff --git a/contrib/python/httpcore/httpcore/_sync/interfaces.py b/contrib/python/httpcore/httpcore/_sync/interfaces.py index 5e95be1ec724..e673d4cc1b1d 100644 --- a/contrib/python/httpcore/httpcore/_sync/interfaces.py +++ 
b/contrib/python/httpcore/httpcore/_sync/interfaces.py @@ -1,5 +1,7 @@ -from contextlib import contextmanager -from typing import Iterator, Optional, Union +from __future__ import annotations + +import contextlib +import typing from .._models import ( URL, @@ -18,12 +20,12 @@ class RequestInterface: def request( self, - method: Union[bytes, str], - url: Union[URL, bytes, str], + method: bytes | str, + url: URL | bytes | str, *, headers: HeaderTypes = None, - content: Union[bytes, Iterator[bytes], None] = None, - extensions: Optional[Extensions] = None, + content: bytes | typing.Iterator[bytes] | None = None, + extensions: Extensions | None = None, ) -> Response: # Strict type checking on our parameters. method = enforce_bytes(method, name="method") @@ -47,16 +49,16 @@ def request( response.close() return response - @contextmanager + @contextlib.contextmanager def stream( self, - method: Union[bytes, str], - url: Union[URL, bytes, str], + method: bytes | str, + url: URL | bytes | str, *, headers: HeaderTypes = None, - content: Union[bytes, Iterator[bytes], None] = None, - extensions: Optional[Extensions] = None, - ) -> Iterator[Response]: + content: bytes | typing.Iterator[bytes] | None = None, + extensions: Extensions | None = None, + ) -> typing.Iterator[Response]: # Strict type checking on our parameters. method = enforce_bytes(method, name="method") url = enforce_url(url, name="url") diff --git a/contrib/python/httpcore/httpcore/_sync/socks_proxy.py b/contrib/python/httpcore/httpcore/_sync/socks_proxy.py index 502e4d7fef2c..0ca96ddfb580 100644 --- a/contrib/python/httpcore/httpcore/_sync/socks_proxy.py +++ b/contrib/python/httpcore/httpcore/_sync/socks_proxy.py @@ -1,8 +1,9 @@ +from __future__ import annotations + import logging import ssl -import typing -from socksio import socks5 +import socksio from .._backends.sync import SyncBackend from .._backends.base import NetworkBackend, NetworkStream @@ -43,24 +44,24 @@ def _init_socks5_connection( *, host: bytes, port: int, - auth: typing.Optional[typing.Tuple[bytes, bytes]] = None, + auth: tuple[bytes, bytes] | None = None, ) -> None: - conn = socks5.SOCKS5Connection() + conn = socksio.socks5.SOCKS5Connection() # Auth method request auth_method = ( - socks5.SOCKS5AuthMethod.NO_AUTH_REQUIRED + socksio.socks5.SOCKS5AuthMethod.NO_AUTH_REQUIRED if auth is None - else socks5.SOCKS5AuthMethod.USERNAME_PASSWORD + else socksio.socks5.SOCKS5AuthMethod.USERNAME_PASSWORD ) - conn.send(socks5.SOCKS5AuthMethodsRequest([auth_method])) + conn.send(socksio.socks5.SOCKS5AuthMethodsRequest([auth_method])) outgoing_bytes = conn.data_to_send() stream.write(outgoing_bytes) # Auth method response incoming_bytes = stream.read(max_bytes=4096) response = conn.receive_data(incoming_bytes) - assert isinstance(response, socks5.SOCKS5AuthReply) + assert isinstance(response, socksio.socks5.SOCKS5AuthReply) if response.method != auth_method: requested = AUTH_METHODS.get(auth_method, "UNKNOWN") responded = AUTH_METHODS.get(response.method, "UNKNOWN") @@ -68,25 +69,25 @@ def _init_socks5_connection( f"Requested {requested} from proxy server, but got {responded}." 
) - if response.method == socks5.SOCKS5AuthMethod.USERNAME_PASSWORD: + if response.method == socksio.socks5.SOCKS5AuthMethod.USERNAME_PASSWORD: # Username/password request assert auth is not None username, password = auth - conn.send(socks5.SOCKS5UsernamePasswordRequest(username, password)) + conn.send(socksio.socks5.SOCKS5UsernamePasswordRequest(username, password)) outgoing_bytes = conn.data_to_send() stream.write(outgoing_bytes) # Username/password response incoming_bytes = stream.read(max_bytes=4096) response = conn.receive_data(incoming_bytes) - assert isinstance(response, socks5.SOCKS5UsernamePasswordReply) + assert isinstance(response, socksio.socks5.SOCKS5UsernamePasswordReply) if not response.success: raise ProxyError("Invalid username/password") # Connect request conn.send( - socks5.SOCKS5CommandRequest.from_address( - socks5.SOCKS5Command.CONNECT, (host, port) + socksio.socks5.SOCKS5CommandRequest.from_address( + socksio.socks5.SOCKS5Command.CONNECT, (host, port) ) ) outgoing_bytes = conn.data_to_send() @@ -95,31 +96,29 @@ def _init_socks5_connection( # Connect response incoming_bytes = stream.read(max_bytes=4096) response = conn.receive_data(incoming_bytes) - assert isinstance(response, socks5.SOCKS5Reply) - if response.reply_code != socks5.SOCKS5ReplyCode.SUCCEEDED: + assert isinstance(response, socksio.socks5.SOCKS5Reply) + if response.reply_code != socksio.socks5.SOCKS5ReplyCode.SUCCEEDED: reply_code = REPLY_CODES.get(response.reply_code, "UNKOWN") raise ProxyError(f"Proxy Server could not connect: {reply_code}.") -class SOCKSProxy(ConnectionPool): +class SOCKSProxy(ConnectionPool): # pragma: nocover """ A connection pool that sends requests via an HTTP proxy. """ def __init__( self, - proxy_url: typing.Union[URL, bytes, str], - proxy_auth: typing.Optional[ - typing.Tuple[typing.Union[bytes, str], typing.Union[bytes, str]] - ] = None, - ssl_context: typing.Optional[ssl.SSLContext] = None, - max_connections: typing.Optional[int] = 10, - max_keepalive_connections: typing.Optional[int] = None, - keepalive_expiry: typing.Optional[float] = None, + proxy_url: URL | bytes | str, + proxy_auth: tuple[bytes | str, bytes | str] | None = None, + ssl_context: ssl.SSLContext | None = None, + max_connections: int | None = 10, + max_keepalive_connections: int | None = None, + keepalive_expiry: float | None = None, http1: bool = True, http2: bool = False, retries: int = 0, - network_backend: typing.Optional[NetworkBackend] = None, + network_backend: NetworkBackend | None = None, ) -> None: """ A connection pool for making HTTP requests. 
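The same pool-level dispatch covers SOCKS proxies: the earlier `_models.py` hunk teaches `URL.origin` the default port for `socks5h`, and `create_connection` routes both `socks5` and `socks5h` proxy URLs to the `Socks5Connection` defined in this file. A construction-only sketch (the socksio handshake in `_init_socks5_connection` runs only when a request is made; the optional `socksio` dependency must be installed and the proxy address is a placeholder):

.. code:: python

    from httpcore._models import Origin, Proxy
    from httpcore._sync.connection_pool import ConnectionPool
    from httpcore._sync.socks_proxy import Socks5Connection

    # Placeholder proxy; the SOCKS5 handshake is deferred to the first request.
    pool = ConnectionPool(proxy=Proxy("socks5h://localhost:1080"))

    conn = pool.create_connection(Origin(b"https", b"example.com", 443))
    assert isinstance(conn, Socks5Connection)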
@@ -167,7 +166,7 @@ def __init__( username, password = proxy_auth username_bytes = enforce_bytes(username, name="proxy_auth") password_bytes = enforce_bytes(password, name="proxy_auth") - self._proxy_auth: typing.Optional[typing.Tuple[bytes, bytes]] = ( + self._proxy_auth: tuple[bytes, bytes] | None = ( username_bytes, password_bytes, ) @@ -192,12 +191,12 @@ def __init__( self, proxy_origin: Origin, remote_origin: Origin, - proxy_auth: typing.Optional[typing.Tuple[bytes, bytes]] = None, - ssl_context: typing.Optional[ssl.SSLContext] = None, - keepalive_expiry: typing.Optional[float] = None, + proxy_auth: tuple[bytes, bytes] | None = None, + ssl_context: ssl.SSLContext | None = None, + keepalive_expiry: float | None = None, http1: bool = True, http2: bool = False, - network_backend: typing.Optional[NetworkBackend] = None, + network_backend: NetworkBackend | None = None, ) -> None: self._proxy_origin = proxy_origin self._remote_origin = remote_origin @@ -211,7 +210,7 @@ def __init__( SyncBackend() if network_backend is None else network_backend ) self._connect_lock = Lock() - self._connection: typing.Optional[ConnectionInterface] = None + self._connection: ConnectionInterface | None = None self._connect_failed = False def handle_request(self, request: Request) -> Response: diff --git a/contrib/python/httpcore/httpcore/_synchronization.py b/contrib/python/httpcore/httpcore/_synchronization.py index 50cfefe0a2c8..2ecc9e9c363e 100644 --- a/contrib/python/httpcore/httpcore/_synchronization.py +++ b/contrib/python/httpcore/httpcore/_synchronization.py @@ -1,6 +1,7 @@ +from __future__ import annotations + import threading -from types import TracebackType -from typing import Optional, Type +import types from ._exceptions import ExceptionMapping, PoolTimeout, map_exceptions @@ -66,7 +67,7 @@ def setup(self) -> None: elif self._backend == "asyncio": self._anyio_lock = anyio.Lock() - async def __aenter__(self) -> "AsyncLock": + async def __aenter__(self) -> AsyncLock: if not self._backend: self.setup() @@ -79,9 +80,9 @@ async def __aenter__(self) -> "AsyncLock": async def __aexit__( self, - exc_type: Optional[Type[BaseException]] = None, - exc_value: Optional[BaseException] = None, - traceback: Optional[TracebackType] = None, + exc_type: type[BaseException] | None = None, + exc_value: BaseException | None = None, + traceback: types.TracebackType | None = None, ) -> None: if self._backend == "trio": self._trio_lock.release() @@ -97,14 +98,14 @@ class AsyncThreadLock: In the async case `AsyncThreadLock` is a no-op. 
""" - def __enter__(self) -> "AsyncThreadLock": + def __enter__(self) -> AsyncThreadLock: return self def __exit__( self, - exc_type: Optional[Type[BaseException]] = None, - exc_value: Optional[BaseException] = None, - traceback: Optional[TracebackType] = None, + exc_type: type[BaseException] | None = None, + exc_value: BaseException | None = None, + traceback: types.TracebackType | None = None, ) -> None: pass @@ -133,7 +134,7 @@ def set(self) -> None: elif self._backend == "asyncio": self._anyio_event.set() - async def wait(self, timeout: Optional[float] = None) -> None: + async def wait(self, timeout: float | None = None) -> None: if not self._backend: self.setup() @@ -206,7 +207,7 @@ def __init__(self) -> None: elif self._backend == "asyncio": self._anyio_shield = anyio.CancelScope(shield=True) - def __enter__(self) -> "AsyncShieldCancellation": + def __enter__(self) -> AsyncShieldCancellation: if self._backend == "trio": self._trio_shield.__enter__() elif self._backend == "asyncio": @@ -215,9 +216,9 @@ def __enter__(self) -> "AsyncShieldCancellation": def __exit__( self, - exc_type: Optional[Type[BaseException]] = None, - exc_value: Optional[BaseException] = None, - traceback: Optional[TracebackType] = None, + exc_type: type[BaseException] | None = None, + exc_value: BaseException | None = None, + traceback: types.TracebackType | None = None, ) -> None: if self._backend == "trio": self._trio_shield.__exit__(exc_type, exc_value, traceback) @@ -239,15 +240,15 @@ class Lock: def __init__(self) -> None: self._lock = threading.Lock() - def __enter__(self) -> "Lock": + def __enter__(self) -> Lock: self._lock.acquire() return self def __exit__( self, - exc_type: Optional[Type[BaseException]] = None, - exc_value: Optional[BaseException] = None, - traceback: Optional[TracebackType] = None, + exc_type: type[BaseException] | None = None, + exc_value: BaseException | None = None, + traceback: types.TracebackType | None = None, ) -> None: self._lock.release() @@ -263,15 +264,15 @@ class ThreadLock: def __init__(self) -> None: self._lock = threading.Lock() - def __enter__(self) -> "ThreadLock": + def __enter__(self) -> ThreadLock: self._lock.acquire() return self def __exit__( self, - exc_type: Optional[Type[BaseException]] = None, - exc_value: Optional[BaseException] = None, - traceback: Optional[TracebackType] = None, + exc_type: type[BaseException] | None = None, + exc_value: BaseException | None = None, + traceback: types.TracebackType | None = None, ) -> None: self._lock.release() @@ -283,7 +284,7 @@ def __init__(self) -> None: def set(self) -> None: self._event.set() - def wait(self, timeout: Optional[float] = None) -> None: + def wait(self, timeout: float | None = None) -> None: if timeout == float("inf"): # pragma: no cover timeout = None if not self._event.wait(timeout=timeout): @@ -305,13 +306,13 @@ class ShieldCancellation: # Thread-synchronous codebases don't support cancellation semantics. # We have this class because we need to mirror the async and sync # cases within our package, but it's just a no-op. 
- def __enter__(self) -> "ShieldCancellation": + def __enter__(self) -> ShieldCancellation: return self def __exit__( self, - exc_type: Optional[Type[BaseException]] = None, - exc_value: Optional[BaseException] = None, - traceback: Optional[TracebackType] = None, + exc_type: type[BaseException] | None = None, + exc_value: BaseException | None = None, + traceback: types.TracebackType | None = None, ) -> None: pass diff --git a/contrib/python/httpcore/httpcore/_trace.py b/contrib/python/httpcore/httpcore/_trace.py index b122a53e88f1..5f1cd7c47829 100644 --- a/contrib/python/httpcore/httpcore/_trace.py +++ b/contrib/python/httpcore/httpcore/_trace.py @@ -1,7 +1,9 @@ +from __future__ import annotations + import inspect import logging -from types import TracebackType -from typing import Any, Dict, Optional, Type +import types +import typing from ._models import Request @@ -11,8 +13,8 @@ def __init__( self, name: str, logger: logging.Logger, - request: Optional[Request] = None, - kwargs: Optional[Dict[str, Any]] = None, + request: Request | None = None, + kwargs: dict[str, typing.Any] | None = None, ) -> None: self.name = name self.logger = logger @@ -21,11 +23,11 @@ def __init__( ) self.debug = self.logger.isEnabledFor(logging.DEBUG) self.kwargs = kwargs or {} - self.return_value: Any = None + self.return_value: typing.Any = None self.should_trace = self.debug or self.trace_extension is not None self.prefix = self.logger.name.split(".")[-1] - def trace(self, name: str, info: Dict[str, Any]) -> None: + def trace(self, name: str, info: dict[str, typing.Any]) -> None: if self.trace_extension is not None: prefix_and_name = f"{self.prefix}.{name}" ret = self.trace_extension(prefix_and_name, info) @@ -44,7 +46,7 @@ def trace(self, name: str, info: Dict[str, Any]) -> None: message = f"{name} {args}" self.logger.debug(message) - def __enter__(self) -> "Trace": + def __enter__(self) -> Trace: if self.should_trace: info = self.kwargs self.trace(f"{self.name}.started", info) @@ -52,9 +54,9 @@ def __enter__(self) -> "Trace": def __exit__( self, - exc_type: Optional[Type[BaseException]] = None, - exc_value: Optional[BaseException] = None, - traceback: Optional[TracebackType] = None, + exc_type: type[BaseException] | None = None, + exc_value: BaseException | None = None, + traceback: types.TracebackType | None = None, ) -> None: if self.should_trace: if exc_value is None: @@ -64,7 +66,7 @@ def __exit__( info = {"exception": exc_value} self.trace(f"{self.name}.failed", info) - async def atrace(self, name: str, info: Dict[str, Any]) -> None: + async def atrace(self, name: str, info: dict[str, typing.Any]) -> None: if self.trace_extension is not None: prefix_and_name = f"{self.prefix}.{name}" coro = self.trace_extension(prefix_and_name, info) @@ -84,7 +86,7 @@ async def atrace(self, name: str, info: Dict[str, Any]) -> None: message = f"{name} {args}" self.logger.debug(message) - async def __aenter__(self) -> "Trace": + async def __aenter__(self) -> Trace: if self.should_trace: info = self.kwargs await self.atrace(f"{self.name}.started", info) @@ -92,9 +94,9 @@ async def __aenter__(self) -> "Trace": async def __aexit__( self, - exc_type: Optional[Type[BaseException]] = None, - exc_value: Optional[BaseException] = None, - traceback: Optional[TracebackType] = None, + exc_type: type[BaseException] | None = None, + exc_value: BaseException | None = None, + traceback: types.TracebackType | None = None, ) -> None: if self.should_trace: if exc_value is None: diff --git a/contrib/python/httpcore/httpcore/_utils.py 
b/contrib/python/httpcore/httpcore/_utils.py index df5dea8fe472..c44ff93cb2f5 100644 --- a/contrib/python/httpcore/httpcore/_utils.py +++ b/contrib/python/httpcore/httpcore/_utils.py @@ -1,10 +1,11 @@ +from __future__ import annotations + import select import socket import sys -import typing -def is_socket_readable(sock: typing.Optional[socket.socket]) -> bool: +def is_socket_readable(sock: socket.socket | None) -> bool: """ Return whether a socket, as identifed by its file descriptor, is readable. "A socket is readable" means that the read buffer isn't empty, i.e. that calling diff --git a/contrib/python/httpcore/ya.make b/contrib/python/httpcore/ya.make index 30d63fe78448..6d4f3507cab8 100644 --- a/contrib/python/httpcore/ya.make +++ b/contrib/python/httpcore/ya.make @@ -2,7 +2,7 @@ PY3_LIBRARY() -VERSION(1.0.6) +VERSION(1.0.7) LICENSE(BSD-3-Clause) From 4a7ef6510b83e56e970f84dfe36d5db561f360e0 Mon Sep 17 00:00:00 2001 From: aneporada Date: Sat, 30 Nov 2024 12:59:34 +0300 Subject: [PATCH 12/16] Add CBO warnings commit_hash:6d989d85970cb7fd0233853c26d44ff8b5ba3c8d --- yql/essentials/core/issue/protos/issue_id.proto | 4 ++++ yql/essentials/core/issue/yql_issue.txt | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/yql/essentials/core/issue/protos/issue_id.proto b/yql/essentials/core/issue/protos/issue_id.proto index 975efe5e3be4..688a471afbdc 100644 --- a/yql/essentials/core/issue/protos/issue_id.proto +++ b/yql/essentials/core/issue/protos/issue_id.proto @@ -195,6 +195,10 @@ message TIssuesIds { PG_NO_LOCKING_SUPPORT = 7000; PG_COMPAT = 7001; +// CBO + CBO_ENUM_LIMIT_REACHED = 8000; + CBO_MISSING_TABLE_STATS = 8001; + // range [200000, 399999) reserved for KiKiMR issue codes, do not use! } diff --git a/yql/essentials/core/issue/yql_issue.txt b/yql/essentials/core/issue/yql_issue.txt index 82a0a1eecd31..148363532c7e 100644 --- a/yql/essentials/core/issue/yql_issue.txt +++ b/yql/essentials/core/issue/yql_issue.txt @@ -683,3 +683,11 @@ ids { code: CORE_TOP_UNSUPPORTED_BLOCK_CALLABLES severity: S_INFO } +ids { + code: CBO_ENUM_LIMIT_REACHED + severity: S_WARNING +} +ids { + code: CBO_MISSING_TABLE_STATS + severity: S_WARNING +} From 878e5db788b99c7ea5be8edaa14c22eace57e991 Mon Sep 17 00:00:00 2001 From: robot-piglet Date: Sat, 30 Nov 2024 19:42:30 +0300 Subject: [PATCH 13/16] Intermediate changes commit_hash:20edff6a454b566f7480e5b5a93697181e8e675b --- .../python/aioresponses/.dist-info/METADATA | 22 ++++++++++-- contrib/python/aioresponses/AUTHORS | 5 +++ contrib/python/aioresponses/README.rst | 17 +++++++++ .../aioresponses/aioresponses/__init__.py | 2 +- .../python/aioresponses/aioresponses/core.py | 11 +++++- .../aioresponses/patches/01-fix-tests.patch | 35 ++++++++----------- .../aioresponses/tests/test_aioresponses.py | 30 ++++++++++++++++ contrib/python/aioresponses/tests/ya.make | 10 ++---- contrib/python/aioresponses/ya.make | 3 +- 9 files changed, 102 insertions(+), 33 deletions(-) diff --git a/contrib/python/aioresponses/.dist-info/METADATA b/contrib/python/aioresponses/.dist-info/METADATA index 54b686eb717d..fc6fdc4a16c2 100644 --- a/contrib/python/aioresponses/.dist-info/METADATA +++ b/contrib/python/aioresponses/.dist-info/METADATA @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: aioresponses -Version: 0.7.6 +Version: 0.7.7 Summary: Mock out requests made by ClientSession from aiohttp package Home-page: https://github.com/pnuckowski/aioresponses Author: Pawel Nuckowski @@ -22,7 +22,8 @@ Classifier: Programming Language :: Python :: 3.11 License-File: LICENSE 
License-File: AUTHORS License-File: AUTHORS.rst -Requires-Dist: aiohttp (<4.0.0,>=3.3.0) +Requires-Dist: packaging>=22.0 +Requires-Dist: aiohttp<4.0.0,>=3.3.0 =============================== aioresponses @@ -258,6 +259,23 @@ E.g. for cases you want to test retrying mechanisms # this will actually perform a request resp = loop.run_until_complete(session.get('http://backend/api')) +**also you can passthrough all requests except specified by mocking object** + +.. code:: python + + import asyncio + import aiohttp + from aioresponses import aioresponses + + @aioresponses(passthrough_unmatched=True) + def test_passthrough_unmatched(m, test_client): + url = 'https://httpbin.org/get' + m.get(url, status=200) + session = aiohttp.ClientSession() + # this will actually perform a request + resp = loop.run_until_complete(session.get('http://backend/api')) + # this will not perform a request and resp2.status will return 200 + resp2 = loop.run_until_complete(session.get(url)) **aioresponses allows to throw an exception** diff --git a/contrib/python/aioresponses/AUTHORS b/contrib/python/aioresponses/AUTHORS index 3854a2941201..5635482f3f9f 100644 --- a/contrib/python/aioresponses/AUTHORS +++ b/contrib/python/aioresponses/AUTHORS @@ -12,6 +12,7 @@ Bryce Drennan Colin-b Daniel Hahler Daniel Tan +Daniël van Noord <13665637+DanielNoord@users.noreply.github.com> David Buxton Fred Thomsen Georg Sauthoff @@ -20,11 +21,13 @@ Hadrien David Hadrien David Ibrahim <8592115+iamibi@users.noreply.github.com> Ilaï Deutel +J. Nick Koston Jakub Boukal Joongi Kim Jordi Soucheiron Jordi Soucheiron Joshua Coats +Juan Calderon-Perez <835733+gaby@users.noreply.github.com> Juan Cruz Lee Treveil Louis Sautier @@ -37,6 +40,7 @@ Pawel Nuckowski Petr Belskiy Rémy HUBSCHER Sam Bull +Stephane Chausson TyVik Ulrik Johansson Ville Skyttä @@ -45,6 +49,7 @@ iamnotaprogrammer iamnotaprogrammer konstantin oren0e +outp1 pnuckowski pnuckowski pyup-bot diff --git a/contrib/python/aioresponses/README.rst b/contrib/python/aioresponses/README.rst index ae63650d0a2f..4f7f6e9bf5e3 100644 --- a/contrib/python/aioresponses/README.rst +++ b/contrib/python/aioresponses/README.rst @@ -232,6 +232,23 @@ E.g. for cases you want to test retrying mechanisms # this will actually perform a request resp = loop.run_until_complete(session.get('http://backend/api')) +**also you can passthrough all requests except specified by mocking object** + +.. 
code:: python + + import asyncio + import aiohttp + from aioresponses import aioresponses + + @aioresponses(passthrough_unmatched=True) + def test_passthrough_unmatched(m, test_client): + url = 'https://httpbin.org/get' + m.get(url, status=200) + session = aiohttp.ClientSession() + # this will actually perform a request + resp = loop.run_until_complete(session.get('http://backend/api')) + # this will not perform a request and resp2.status will return 200 + resp2 = loop.run_until_complete(session.get(url)) **aioresponses allows to throw an exception** diff --git a/contrib/python/aioresponses/aioresponses/__init__.py b/contrib/python/aioresponses/aioresponses/__init__.py index c61652c9aa05..3cafd214cf8b 100644 --- a/contrib/python/aioresponses/aioresponses/__init__.py +++ b/contrib/python/aioresponses/aioresponses/__init__.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- from .core import CallbackResult, aioresponses -__version__ = '0.7.3' +__version__ = '0.7.6' __all__ = [ 'CallbackResult', diff --git a/contrib/python/aioresponses/aioresponses/core.py b/contrib/python/aioresponses/aioresponses/core.py index 2bb6d57365da..6346ecfff0ff 100644 --- a/contrib/python/aioresponses/aioresponses/core.py +++ b/contrib/python/aioresponses/aioresponses/core.py @@ -155,6 +155,7 @@ def _build_response(self, url: 'Union[URL, str]', url=url, method=method, headers=CIMultiDictProxy(CIMultiDict(**request_headers)), + real_url=url ) kwargs['writer'] = None kwargs['continue100'] = None @@ -225,6 +226,7 @@ class aioresponses(object): def __init__(self, **kwargs: Any): self._param = kwargs.pop('param', None) self._passthrough = kwargs.pop('passthrough', []) + self.passthrough_unmatched = kwargs.pop('passthrough_unmatched', False) self.patcher = patch('aiohttp.client.ClientSession._request', side_effect=self._request_mock, autospec=True) @@ -512,6 +514,10 @@ async def _request_mock(self, orig_self: ClientSession, response = await self.match(method, url, **kwargs) if response is None: + if self.passthrough_unmatched: + return (await self.patcher.temp_original( + orig_self, method, url_origin, *args, **kwargs + )) raise ClientConnectionError( 'Connection refused: {} {}'.format(method, url) ) @@ -527,7 +533,10 @@ async def _request_mock(self, orig_self: ClientSession, raise_for_status = getattr( orig_self, '_raise_for_status', False ) - if raise_for_status: + + if callable(raise_for_status): + await raise_for_status(response) + elif raise_for_status: response.raise_for_status() return response diff --git a/contrib/python/aioresponses/patches/01-fix-tests.patch b/contrib/python/aioresponses/patches/01-fix-tests.patch index 2597c8f8385b..d9e9f918601c 100644 --- a/contrib/python/aioresponses/patches/01-fix-tests.patch +++ b/contrib/python/aioresponses/patches/01-fix-tests.patch @@ -83,33 +83,15 @@ @aioresponses() def test_returned_instance(self, m): -@@ -369,7 +390,7 @@ class AIOResponsesTestCase(AsyncTestCase): - assert str(exception_info.exception) == "Session is closed" - - async def test_address_as_instance_of_url_combined_with_pass_through(self): +@@ -369,1 +390,1 @@ class AIOResponsesTestCase(AsyncTestCase): - external_api = 'http://httpbin.org/status/201' + external_api = str(self.external_server.make_url('/status/201')) - - async def doit(): - api_resp = await self.session.get(self.url) -@@ -386,7 +407,7 @@ class AIOResponsesTestCase(AsyncTestCase): - self.assertEqual(ext.status, 201) - - async def test_pass_through_with_origin_params(self): +@@ -386,1 +407,1 @@ class AIOResponsesTestCase(AsyncTestCase): - 
external_api = 'http://httpbin.org/get' + external_api = str(self.external_server.make_url('/get')) - - async def doit(params): - # we have to hit actual url, -@@ -400,7 +421,7 @@ class AIOResponsesTestCase(AsyncTestCase): - params = {'foo': 'bar'} - ext = await doit(params=params) - self.assertEqual(ext.status, 200) +@@ -400,1 +421,1 @@ class AIOResponsesTestCase(AsyncTestCase): - self.assertEqual(str(ext.url), 'http://httpbin.org/get?foo=bar') + self.assertEqual(str(ext.url), external_api + '?foo=bar') - - @aioresponses() - async def test_custom_response_class(self, m): --- contrib/python/aioresponses/tests/test_compat.py (index) +++ contrib/python/aioresponses/tests/test_compat.py (working tree) @@ -2,7 +2,6 @@ @@ -188,3 +170,14 @@ + url = get_url(self.url_without_parameters, as_str) + + self.assertEqual(merge_params(url, {'x': 42}), expected_url) +--- contrib/python/aioresponses/tests/test_aioresponses.py (index) ++++ contrib/python/aioresponses/tests/test_aioresponses.py (working tree) +@@ -818,7 +818,7 @@ class AIOResponseRedirectTest(AsyncTestCase): + self.assertEqual(len(response.history), 1) + self.assertEqual(str(response.history[0].url), url) + +- async def test_pass_through_unmatched_requests(self): ++ async def _test_pass_through_unmatched_requests(self): + matched_url = "https://matched_example.org" + unmatched_url = "https://httpbin.org/get" + params_unmatched = {'foo': 'bar'} diff --git a/contrib/python/aioresponses/tests/test_aioresponses.py b/contrib/python/aioresponses/tests/test_aioresponses.py index 0555bdcd2954..b380ac5c7c3d 100644 --- a/contrib/python/aioresponses/tests/test_aioresponses.py +++ b/contrib/python/aioresponses/tests/test_aioresponses.py @@ -697,6 +697,21 @@ async def test_do_not_raise_for_status(self, m): self.assertEqual(response.status, 400) + @aioresponses() + @skipIf(condition=AIOHTTP_VERSION < Version('3.9.0'), + reason='aiohttp<3.9.0 does not support callable raise_for_status ' + 'arguments for requests') + async def test_callable_raise_for_status(self, m): + async def raise_for_status(response: ClientResponse): + if response.status >= 400: + raise Exception("callable raise_for_status") + + m.get(self.url, status=400) + with self.assertRaises(Exception) as cm: + await self.session.get(self.url, + raise_for_status=raise_for_status) + self.assertEqual(str(cm.exception), "callable raise_for_status") + class AIOResponseRedirectTest(AsyncTestCase): @@ -802,3 +817,18 @@ async def test_relative_url_redirect_followed(self, rsps): self.assertEqual(str(response.url), f"{base_url}/baz") self.assertEqual(len(response.history), 1) self.assertEqual(str(response.history[0].url), url) + + async def _test_pass_through_unmatched_requests(self): + matched_url = "https://matched_example.org" + unmatched_url = "https://httpbin.org/get" + params_unmatched = {'foo': 'bar'} + + with aioresponses(passthrough_unmatched=True) as m: + m.post(URL(matched_url), status=200) + mocked_response = await self.session.post(URL(matched_url)) + response = await self.session.get( + URL(unmatched_url), params=params_unmatched + ) + self.assertEqual(response.status, 200) + self.assertEqual(str(response.url), 'https://httpbin.org/get?foo=bar') + self.assertEqual(mocked_response.status, 200) diff --git a/contrib/python/aioresponses/tests/ya.make b/contrib/python/aioresponses/tests/ya.make index 95d5a5994681..9276486b1132 100644 --- a/contrib/python/aioresponses/tests/ya.make +++ b/contrib/python/aioresponses/tests/ya.make @@ -1,17 +1,13 @@ -SUBSCRIBER(g:python-contrib) - PY3TEST() 
+SUBSCRIBER(g:python-contrib) + NO_LINT() PEERDIR( contrib/python/aioresponses ) -TEST_SRCS( - base.py - test_aioresponses.py - test_compat.py -) +ALL_PYTEST_SRCS() END() diff --git a/contrib/python/aioresponses/ya.make b/contrib/python/aioresponses/ya.make index 574b5f85f13f..50df6052a328 100644 --- a/contrib/python/aioresponses/ya.make +++ b/contrib/python/aioresponses/ya.make @@ -2,12 +2,13 @@ PY3_LIBRARY() -VERSION(0.7.6) +VERSION(0.7.7) LICENSE(MIT) PEERDIR( contrib/python/aiohttp + contrib/python/packaging ) NO_LINT() From 1a14888488e86ce1cea6ed977e98b65a24577b44 Mon Sep 17 00:00:00 2001 From: babenko Date: Sat, 30 Nov 2024 19:44:10 +0300 Subject: [PATCH 14/16] Deprecate memory tags API in yt codebase commit_hash:90937e5cd8a5f01663a1f162955925e299c3d892 --- .../concurrency/fiber_scheduler_thread.cpp | 15 -- yt/yt/core/concurrency/fls-inl.h | 3 - .../concurrency/unittests/scheduler_ut.cpp | 63 ----- yt/yt/core/misc/ref_counted_tracker.cpp | 6 - yt/yt/core/misc/unittests/memory_tag_ut.cpp | 243 ------------------ yt/yt/core/misc/unittests/ya.make | 1 - .../rpc/unittests/rpc_allocation_tags_ut.cpp | 6 +- .../tracing/unittests/allocation_tags_ut.cpp | 6 +- .../ytprof/unittests/heap_profiler_ut.cpp | 8 +- 9 files changed, 10 insertions(+), 341 deletions(-) delete mode 100644 yt/yt/core/misc/unittests/memory_tag_ut.cpp diff --git a/yt/yt/core/concurrency/fiber_scheduler_thread.cpp b/yt/yt/core/concurrency/fiber_scheduler_thread.cpp index 72130a6717a5..f4af46f4712c 100644 --- a/yt/yt/core/concurrency/fiber_scheduler_thread.cpp +++ b/yt/yt/core/concurrency/fiber_scheduler_thread.cpp @@ -20,8 +20,6 @@ #include -#include - #include #include @@ -180,13 +178,6 @@ class TFiberContextGuard //////////////////////////////////////////////////////////////////////////////// -Y_FORCE_INLINE TMemoryTag SwapMemoryTag(TMemoryTag tag) -{ - auto result = GetCurrentMemoryTag(); - SetCurrentMemoryTag(tag); - return result; -} - Y_FORCE_INLINE TFiberId SwapCurrentFiberId(TFiberId fiberId) { auto result = GetCurrentFiberId(); @@ -818,7 +809,6 @@ class TBaseSwitchHandler void OnSwitch() { FiberId_ = SwapCurrentFiberId(FiberId_); - MemoryTag_ = SwapMemoryTag(MemoryTag_); Fls_ = SwapCurrentFls(Fls_); MinLogLevel_ = SwapMinLogLevel(MinLogLevel_); } @@ -826,13 +816,11 @@ class TBaseSwitchHandler ~TBaseSwitchHandler() { YT_VERIFY(FiberId_ == InvalidFiberId); - YT_VERIFY(MemoryTag_ == NullMemoryTag); YT_VERIFY(!Fls_); YT_VERIFY(MinLogLevel_ == ELogLevel::Minimum); } private: - TMemoryTag MemoryTag_ = NullMemoryTag; TFls* Fls_ = nullptr; TFiberId FiberId_ = InvalidFiberId; ELogLevel MinLogLevel_ = ELogLevel::Minimum; @@ -1147,7 +1135,6 @@ TFiberCanceler GetCurrentFiberCanceler() } if (!switchHandler->Canceler()) { - TMemoryTagGuard guard(NullMemoryTag); switchHandler->Canceler() = New(GetCurrentFiberId()); } @@ -1162,8 +1149,6 @@ void WaitUntilSet(TFuture future, IInvokerPtr invoker) YT_VERIFY(future); YT_ASSERT(invoker); - TMemoryTagGuard memoryTagGuard(NullMemoryTag); - auto* currentFiber = NDetail::TryGetCurrentFiber(); if (!currentFiber) { // When called from a fiber-unfriendly context, we fallback to blocking wait. 
diff --git a/yt/yt/core/concurrency/fls-inl.h b/yt/yt/core/concurrency/fls-inl.h index 4f8eba6b6f87..6267f7f0547d 100644 --- a/yt/yt/core/concurrency/fls-inl.h +++ b/yt/yt/core/concurrency/fls-inl.h @@ -5,8 +5,6 @@ #endif #undef FLS_INL_H_ -#include - #include namespace NYT::NConcurrency { @@ -97,7 +95,6 @@ Y_FORCE_INLINE T* TFlsSlot::GetOrCreate() const template T* TFlsSlot::Create() const { - TMemoryTagGuard guard(NullMemoryTag); auto cookie = new T(); GetCurrentFls()->Set(Index_, cookie); return static_cast(cookie); diff --git a/yt/yt/core/concurrency/unittests/scheduler_ut.cpp b/yt/yt/core/concurrency/unittests/scheduler_ut.cpp index 15320fae90bf..957c1f2ddf8e 100644 --- a/yt/yt/core/concurrency/unittests/scheduler_ut.cpp +++ b/yt/yt/core/concurrency/unittests/scheduler_ut.cpp @@ -1042,31 +1042,6 @@ TEST_W(TSchedulerTest, CancelDelayedFuture) EXPECT_EQ(NYT::EErrorCode::Generic, error.InnerErrors()[0].GetCode()); } -class TVerifyingMemoryTagGuard -{ -public: - explicit TVerifyingMemoryTagGuard(TMemoryTag tag) - : Tag_(tag) - , SavedTag_(GetCurrentMemoryTag()) - { - SetCurrentMemoryTag(Tag_); - } - - ~TVerifyingMemoryTagGuard() - { - auto tag = GetCurrentMemoryTag(); - EXPECT_EQ(tag, Tag_); - SetCurrentMemoryTag(SavedTag_); - } - - TVerifyingMemoryTagGuard(const TVerifyingMemoryTagGuard& other) = delete; - TVerifyingMemoryTagGuard(TVerifyingMemoryTagGuard&& other) = delete; - -private: - const TMemoryTag Tag_; - const TMemoryTag SavedTag_; -}; - class TWrappingInvoker : public TInvokerWrapper { @@ -1094,44 +1069,6 @@ class TWrappingInvoker void virtual DoRunCallback(TClosure callback) = 0; }; -class TVerifyingMemoryTaggingInvoker - : public TWrappingInvoker -{ -public: - TVerifyingMemoryTaggingInvoker(IInvokerPtr invoker, TMemoryTag memoryTag) - : TWrappingInvoker(std::move(invoker)) - , MemoryTag_(memoryTag) - { } - -private: - const TMemoryTag MemoryTag_; - - void DoRunCallback(TClosure callback) override - { - TVerifyingMemoryTagGuard memoryTagGuard(MemoryTag_); - callback(); - } -}; - -TEST_W(TSchedulerTest, MemoryTagAndResumer) -{ - auto actionQueue = New(); - - auto invoker1 = New(actionQueue->GetInvoker(), 1); - auto invoker2 = New(actionQueue->GetInvoker(), 2); - - auto asyncResult = BIND([=] { - EXPECT_EQ(GetCurrentMemoryTag(), 1u); - SwitchTo(invoker2); - EXPECT_EQ(GetCurrentMemoryTag(), 1u); - }) - .AsyncVia(invoker1) - .Run(); - - WaitFor(asyncResult) - .ThrowOnError(); -} - void CheckTraceContextTime(const NTracing::TTraceContextPtr& traceContext, TDuration lo, TDuration hi) { auto actual = traceContext->GetElapsedTime(); diff --git a/yt/yt/core/misc/ref_counted_tracker.cpp b/yt/yt/core/misc/ref_counted_tracker.cpp index 85d1e0fd1a23..21f9eb815fd8 100644 --- a/yt/yt/core/misc/ref_counted_tracker.cpp +++ b/yt/yt/core/misc/ref_counted_tracker.cpp @@ -6,8 +6,6 @@ #include -#include - #include #include @@ -402,8 +400,6 @@ void TRefCountedTracker::FreeSpaceSlow(TRefCountedTypeCookie cookie, size_t spac TRefCountedTracker::TLocalSlot* TRefCountedTracker::GetLocalSlot(TRefCountedTypeCookie cookie) { - TMemoryTagGuard memoryTagGuard(NullMemoryTag); - struct TReclaimer { ~TReclaimer() @@ -460,8 +456,6 @@ TRefCountedTracker::TLocalSlot* TRefCountedTracker::GetLocalSlot(TRefCountedType TRefCountedTracker::TGlobalSlot* TRefCountedTracker::GetGlobalSlot(TRefCountedTypeCookie cookie) { - TMemoryTagGuard memoryTagGuard(NullMemoryTag); - VERIFY_SPINLOCK_AFFINITY(SpinLock_); auto index = cookie.Underlying(); if (index >= std::ssize(GlobalSlots_)) { diff --git 
a/yt/yt/core/misc/unittests/memory_tag_ut.cpp b/yt/yt/core/misc/unittests/memory_tag_ut.cpp deleted file mode 100644 index f74563f64d3b..000000000000 --- a/yt/yt/core/misc/unittests/memory_tag_ut.cpp +++ /dev/null @@ -1,243 +0,0 @@ -#include - -#include - -#include -#include -#include - -#include - -#include - -#include - -// These tests do not work under MSAN and ASAN. -#if !defined(_msan_enabled_) and !defined(_asan_enabled_) and defined(_linux_) and defined(YT_ALLOC_ENABLED) - -namespace NYT { -namespace { - -//////////////////////////////////////////////////////////////////////////////// - -// Used for fake side effects to disable compiler optimizations. -volatile const void* FakeSideEffectVolatileVariable = nullptr; - -//////////////////////////////////////////////////////////////////////////////// - -using namespace NConcurrency; -using namespace ::testing; - -//////////////////////////////////////////////////////////////////////////////// - -class TMemoryTagTest - : public TestWithParam -{ -public: - TMemoryTagTest() = default; -}; - -//////////////////////////////////////////////////////////////////////////////// - -// Allocate vector that results in exactly `size` memory usage considering the 16-byte header. -std::vector MakeAllocation(size_t size) -{ - YT_VERIFY(IsPowerOf2(size)); - - auto result = std::vector(size); - - // We make fake side effect to prevent any compiler optimizations here. - // (Clever compilers like to throw away our unused allocations). - FakeSideEffectVolatileVariable = result.data(); - return result; -} - -//////////////////////////////////////////////////////////////////////////////// - -void TestStackingGuards() -{ - TMemoryTagGuard guard1(1); - EXPECT_EQ(GetMemoryUsageForTag(1), 0u); - auto allocation1 = MakeAllocation(1 << 5); - EXPECT_EQ(GetMemoryUsageForTag(1), 1u << 5); - { - TMemoryTagGuard guard2(2); - auto allocation2 = MakeAllocation(1 << 6); - EXPECT_EQ(GetMemoryUsageForTag(1), 1u << 5); - EXPECT_EQ(GetMemoryUsageForTag(2), 1u << 6); - } - EXPECT_EQ(GetMemoryUsageForTag(1), 1u << 5); - EXPECT_EQ(GetMemoryUsageForTag(2), 0u); - { - TMemoryTagGuard guard2(std::move(guard1)); - auto allocation2 = MakeAllocation(1 << 7); - EXPECT_EQ(GetMemoryUsageForTag(1), (1u << 5) + (1u << 7)); - EXPECT_EQ(GetMemoryUsageForTag(2), 0u); - } - EXPECT_EQ(GetMemoryUsageForTag(1), (1u << 5)); - EXPECT_EQ(GetMemoryUsageForTag(2), 0u); -} - -//////////////////////////////////////////////////////////////////////////////// - -void Action1() -{ - TMemoryTagGuard guard(1); - Yield(); - auto allocation1 = MakeAllocation(1 << 5); - EXPECT_EQ(GetMemoryUsageForTag(1), 1u << 5); - Yield(); - auto allocation2 = MakeAllocation(1 << 7); - EXPECT_EQ(GetMemoryUsageForTag(1), (1u << 5) + (1u << 7)); - Yield(); - auto allocation3 = MakeAllocation(1 << 9); - EXPECT_EQ(GetMemoryUsageForTag(1), (1u << 5) + (1u << 7) + (1u << 9)); -} - -void Action2() -{ - TMemoryTagGuard guard(2); - Yield(); - auto allocation1 = MakeAllocation(1 << 6); - EXPECT_EQ(GetMemoryUsageForTag(2), 1u << 6); - Yield(); - auto allocation2 = MakeAllocation(1 << 8); - EXPECT_EQ(GetMemoryUsageForTag(2), (1u << 6) + (1u << 8)); - Yield(); - auto allocation3 = MakeAllocation(1 << 10); - EXPECT_EQ(GetMemoryUsageForTag(2), (1u << 6) + (1u << 8) + (1u << 10)); -} - -void TestSwitchingFibers() -{ - auto future1 = BIND(&Action1) - .AsyncVia(GetCurrentInvoker()) - .Run(); - auto future2 = BIND(&Action2) - .AsyncVia(GetCurrentInvoker()) - .Run(); - WaitFor(AllSucceeded(std::vector>{future1, future2})) - .ThrowOnError(); - 
EXPECT_EQ(GetMemoryUsageForTag(1), 0u); - EXPECT_EQ(GetMemoryUsageForTag(2), 0u); -} - -//////////////////////////////////////////////////////////////////////////////// - -class TMiniController - : public TRefCounted -{ -public: - TMiniController(IInvokerPtr controlInvoker, TMemoryTag memoryTag) - : MemoryTag_(memoryTag) - , Invoker_(CreateMemoryTaggingInvoker(CreateSerializedInvoker(std::move(controlInvoker)), MemoryTag_)) - { } - - ssize_t GetMemoryUsage() const - { - return GetMemoryUsageForTag(MemoryTag_); - } - - IInvokerPtr GetControlInvoker() const - { - return Invoker_; - } - - std::vector>& Allocations() - { - return Allocations_; - } - -private: - TMemoryTag MemoryTag_; - IInvokerPtr Invoker_; - std::vector> Allocations_; -}; - -DEFINE_REFCOUNTED_TYPE(TMiniController) -DECLARE_REFCOUNTED_CLASS(TMiniController) - -void Action3(TMiniControllerPtr controller) -{ - controller->Allocations().emplace_back(MakeAllocation(128_MB)); -} - -void TestMemoryTaggingInvoker() -{ - auto queue = New(); - auto controller = New(queue->GetInvoker(), 1); - EXPECT_EQ(controller->GetMemoryUsage(), 0); - - WaitFor(BIND(&Action3, controller) - .AsyncVia(controller->GetControlInvoker()) - .Run()) - .ThrowOnError(); - EXPECT_NEAR(controller->GetMemoryUsage(), 128_MB, 1_MB); - - controller->Allocations().clear(); - controller->Allocations().shrink_to_fit(); - - EXPECT_NEAR(GetMemoryUsageForTag(1), 0, 1_MB); -} - -void TestControllersInThreadPool() -{ - std::vector controllers; - constexpr int controllerCount = 1000; - auto pool = CreateThreadPool(16, "TestPool"); - for (int index = 0; index < controllerCount; ++index) { - controllers.emplace_back(New(pool->GetInvoker(), index + 1)); - } - constexpr int actionCount = 100 * 1000; - std::vector> futures; - std::vector memoryUsages(controllerCount); - srand(42); - for (int index = 0; index < actionCount; ++index) { - int controllerIndex = rand() % controllerCount; - auto allocationSize = 1 << (5 + rand() % 10); - memoryUsages[controllerIndex] += allocationSize; - const auto& controller = controllers[controllerIndex]; - futures.emplace_back( - BIND([] (TMiniControllerPtr controller, int allocationSize) { - controller->Allocations().emplace_back(MakeAllocation(allocationSize)); - }, controller, allocationSize) - .AsyncVia(controller->GetControlInvoker()) - .Run()); - } - WaitFor(AllSucceeded(futures)) - .ThrowOnError(); - for (int index = 0; index < controllerCount; ++index) { - EXPECT_NEAR(memoryUsages[index], controllers[index]->GetMemoryUsage(), 10_KB); - } - controllers.clear(); - for (int index = 0; index < controllerCount; ++index) { - EXPECT_NEAR(GetMemoryUsageForTag(index + 1), 0, 10_KB); - EXPECT_GE(GetMemoryUsageForTag(index + 1), 0u); - } -} - -//////////////////////////////////////////////////////////////////////////////// - -TEST_P(TMemoryTagTest, Test) -{ - // We wrap anything with an outer action queue to make - // fiber-friendly environment. 
- auto outerQueue = New(); - WaitFor(BIND(GetParam()) - .AsyncVia(outerQueue->GetInvoker()) - .Run()) - .ThrowOnError(); -} - -INSTANTIATE_TEST_SUITE_P(MemoryTagTest, TMemoryTagTest, Values( - &TestStackingGuards, - &TestSwitchingFibers, - &TestMemoryTaggingInvoker, - &TestControllersInThreadPool)); - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace -} // namespace NYT - -#endif // !defined(_msan_enabled_) diff --git a/yt/yt/core/misc/unittests/ya.make b/yt/yt/core/misc/unittests/ya.make index 62fb8ece1357..a42283827075 100644 --- a/yt/yt/core/misc/unittests/ya.make +++ b/yt/yt/core/misc/unittests/ya.make @@ -42,7 +42,6 @@ SRCS( lock_free_hash_table_ut.cpp lru_cache_ut.cpp maybe_inf_ut.cpp - memory_tag_ut.cpp moving_average_ut.cpp mpsc_fair_share_queue_ut.cpp mpsc_stack_ut.cpp diff --git a/yt/yt/core/rpc/unittests/rpc_allocation_tags_ut.cpp b/yt/yt/core/rpc/unittests/rpc_allocation_tags_ut.cpp index ec5c49d7ccab..55dd20bde7e5 100644 --- a/yt/yt/core/rpc/unittests/rpc_allocation_tags_ut.cpp +++ b/yt/yt/core/rpc/unittests/rpc_allocation_tags_ut.cpp @@ -33,7 +33,7 @@ TYPED_TEST(TRpcTest, ResponseWithAllocationTags) auto previousLimit = memoryUsageTracker->GetLimit(); memoryUsageTracker->SetLimit(2_GB); - static TMemoryTag testMemoryTag = 1 << 20; + static int testMemoryTag = 1 << 20; testMemoryTag++; EnableMemoryProfilingTags(); @@ -70,11 +70,11 @@ TYPED_TEST(TRpcTest, ResponseWithAllocationTags) req2->set_size(size); auto rspFutureProp = req2->Invoke() - .Apply(BIND([testMemoryTag=testMemoryTag] (const TRspPtr& res) { + .Apply(BIND([testMemoryTag = testMemoryTag] (const TRspPtr& res) { auto localContext = TryGetCurrentTraceContext(); EXPECT_NE(localContext, nullptr); if (localContext) { - EXPECT_EQ(localContext->FindAllocationTag(MemoryAllocationTag).value_or(NullMemoryTag), testMemoryTag); + EXPECT_EQ(localContext->FindAllocationTag(MemoryAllocationTag).value_or(NullMemoryTag), testMemoryTag); } return res; }).AsyncVia(actionQueue->GetInvoker())); diff --git a/yt/yt/core/tracing/unittests/allocation_tags_ut.cpp b/yt/yt/core/tracing/unittests/allocation_tags_ut.cpp index 9c567309ee22..86a645ea5f7e 100644 --- a/yt/yt/core/tracing/unittests/allocation_tags_ut.cpp +++ b/yt/yt/core/tracing/unittests/allocation_tags_ut.cpp @@ -16,7 +16,7 @@ TEST(TAllocationTagsTest, GetSetAllocationTags) ASSERT_EQ(traceContext->FindAllocationTag("a"), std::nullopt); traceContext->SetAllocationTags({{"user", "first"}, {"sometag", "my"}}); - ASSERT_EQ(traceContext->FindAllocationTag("memory_tag"), std::nullopt); + ASSERT_EQ(traceContext->FindAllocationTag("memory_tag"), std::nullopt); ASSERT_EQ(traceContext->FindAllocationTag("user"), "first"); ASSERT_EQ(traceContext->FindAllocationTag("sometag"), "my"); ASSERT_EQ(traceContext->FindAllocationTag("other"), std::nullopt); @@ -35,8 +35,8 @@ TEST(TAllocationTagsTest, GetSetAllocationTags) ASSERT_EQ(traceContext->FindAllocationTag("sometag"), std::nullopt); ASSERT_TRUE(traceContext->GetAllocationTags().empty()); - traceContext->SetAllocationTag("memory_tag", TMemoryTag{1}); - ASSERT_EQ(traceContext->FindAllocationTag("memory_tag"), TMemoryTag{1}); + traceContext->SetAllocationTag("memory_tag", 1); + ASSERT_EQ(traceContext->FindAllocationTag("memory_tag"), 1); ASSERT_FALSE(traceContext->GetAllocationTags().empty()); } diff --git a/yt/yt/library/ytprof/unittests/heap_profiler_ut.cpp b/yt/yt/library/ytprof/unittests/heap_profiler_ut.cpp index 6935216130e5..a7a19b8eb93f 100644 --- 
a/yt/yt/library/ytprof/unittests/heap_profiler_ut.cpp
+++ b/yt/yt/library/ytprof/unittests/heap_profiler_ut.cpp
@@ -64,9 +64,9 @@ TEST(THeapProfilerTest, ReadProfile)
 
     auto h0 = BlowHeap<0>();
 
-    auto tag = TMemoryTag(1);
+    int tag = 1;
     traceContext->SetAllocationTags({{"user", "second"}, {"sometag", "notmy"}, {MemoryAllocationTagKey, ToString(tag)}});
-    auto currentTag = traceContext->FindAllocationTag(MemoryAllocationTagKey);
+    auto currentTag = traceContext->FindAllocationTag(MemoryAllocationTagKey);
     ASSERT_EQ(currentTag, tag);
 
     auto h1 = BlowHeap<1>();
@@ -99,7 +99,7 @@ TEST(THeapProfilerTest, ReadProfile)
     output.Finish();
 }
 
-TEST(THeapProfilerTest, AllocationTagsWithMemoryTag)
+TEST(THeapProfilerTest, AllocationTags)
 {
     EnableMemoryProfilingTags();
     auto traceContext = TTraceContext::NewRoot("Root");
@@ -201,7 +201,7 @@ TEST(THeapProfilerTest, HugeAllocationsTagsWithMemoryTag)
     heap.push_back(BlowHeap<0>());
 
     traceContext->SetAllocationTag(MemoryAllocationTagKey, MemoryAllocationTagValues[1]);
-    ASSERT_EQ(traceContext->FindAllocationTag(MemoryAllocationTagKey), 1);
+    ASSERT_EQ(traceContext->FindAllocationTag(MemoryAllocationTagKey), 1);
 
     heap.push_back(BlowHeap<1>(100));

From 21adcc74febab524dedf75a02d887e6f507d0b7e Mon Sep 17 00:00:00 2001
From: alevitskii
Date: Sat, 30 Nov 2024 20:01:31 +0300
Subject: [PATCH 15/16] Support autoincludes in ya style
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Support autoincludes in ya style

* Made configs a separate entity that stylers can use
* Added support for the `AutoincludeConfig` entity and added it to the black, ruff, and clang-format stylers

commit_hash:948c057433b3247dd84044f7c4743d2fb1d0c336
---
 .../tests/py_style/default_configs.json  |  5 +++--
 build/plugins/lib/test_const/__init__.py | 19 ++++++++++++++++---
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/build/config/tests/py_style/default_configs.json b/build/config/tests/py_style/default_configs.json
index 1c8f2ab58517..fe67053022b5 100644
--- a/build/config/tests/py_style/default_configs.json
+++ b/build/config/tests/py_style/default_configs.json
@@ -1,6 +1,7 @@
 {
+    "black": "build/config/tests/py_style/config.toml",
+    "dummy_linter": "build/config/tests/py_style/dummy_linter_config.json",
     "flake8": "build/config/tests/flake8/flake8.conf",
     "py2_flake8": "build/config/tests/flake8/flake8.conf",
-    "black": "build/config/tests/py_style/config.toml",
-    "dummy_linter": "build/config/tests/py_style/dummy_linter_config.json"
+    "ruff": "build/config/tests/ruff/ruff.toml"
 }
diff --git a/build/plugins/lib/test_const/__init__.py b/build/plugins/lib/test_const/__init__.py
index 9e4648d9b140..02c3c5313ad0 100644
--- a/build/plugins/lib/test_const/__init__.py
+++ b/build/plugins/lib/test_const/__init__.py
@@ -438,10 +438,11 @@ class ServiceTags(Enum):
 
 
 class PythonLinterName(Enum):
-    Flake8 = "flake8"
-    Py2Flake8 = "py2_flake8"
     Black = "black"
     DummyLinter = "dummy_linter"
+    Flake8 = "flake8"
+    Py2Flake8 = "py2_flake8"
+    Ruff = "ruff"
 
 
 class CppLinterName(Enum):
@@ -449,8 +450,20 @@ class CppLinterName(Enum):
 
 
 class DefaultLinterConfig(Enum):
-    Python = "build/config/tests/py_style/default_configs.json"
     Cpp = "build/config/tests/cpp_style/default_configs.json"
+    Python = "build/config/tests/py_style/default_configs.json"
+
+
+LINTER_CONFIG_TYPES = {
+    CppLinterName.ClangFormat: (".clang-format",),
+    PythonLinterName.Black: ("pyproject.toml",),
+    PythonLinterName.Ruff: ("pyproject.toml", "ruff.toml"),
+}
+
+AUTOINCLUDE_PATHS = (
+    'build/conf/autoincludes.json',
+    'build/internal/conf/autoincludes.json',
+)
 
 
 class Status(object):

From b4cba2872f6348770b561394bef1668f54abfe27 Mon Sep 17 00:00:00 2001
From: Alexander Smirnov
Date: Sat, 30 Nov 2024 18:17:06 +0000
Subject: [PATCH 16/16] Import libraries 241130-1815

---
 ydb/ci/rightlib.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ydb/ci/rightlib.txt b/ydb/ci/rightlib.txt
index 28835e2e6e37..6cb64cb1a943 100644
--- a/ydb/ci/rightlib.txt
+++ b/ydb/ci/rightlib.txt
@@ -1 +1 @@
-56a560baa86b52c66ce622414579975930421950
+21adcc74febab524dedf75a02d887e6f507d0b7e
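
The autoincludes patch above adds `LINTER_CONFIG_TYPES`, mapping each styler to the config file names it recognizes, and `AUTOINCLUDE_PATHS`, pointing at the repository's autoinclude lists. A minimal sketch of how a styler wrapper could combine such constants to pick a per-project config, falling back to the per-linter defaults from `default_configs.json`; the helper names, the JSON shape of the autoincludes files, and the lookup order are assumptions made for illustration, not the actual `ya style` implementation.

```python
import json
from pathlib import Path

# String-keyed stand-ins for the enum-keyed constants in
# build/plugins/lib/test_const/__init__.py, so the sketch stays self-contained.
LINTER_CONFIG_TYPES = {
    "clang_format": (".clang-format",),
    "black": ("pyproject.toml",),
    "ruff": ("pyproject.toml", "ruff.toml"),
}
AUTOINCLUDE_PATHS = (
    "build/conf/autoincludes.json",
    "build/internal/conf/autoincludes.json",
)


def load_autoinclude_roots(repo_root: Path) -> set:
    # Assumption: each autoincludes file is a JSON list of project-root paths.
    roots = set()
    for rel in AUTOINCLUDE_PATHS:
        path = repo_root / rel
        if path.exists():
            roots.update(json.loads(path.read_text()))
    return roots


def resolve_linter_config(repo_root: Path, source_file: Path, linter: str):
    """Return the closest project config for `linter`, or None to use the default.

    Assumes `source_file` lives under `repo_root`.
    """
    repo_root = repo_root.resolve()
    roots = load_autoinclude_roots(repo_root)
    directory = source_file.resolve().parent
    while True:
        for name in LINTER_CONFIG_TYPES.get(linter, ()):
            candidate = directory / name
            if candidate.exists():
                return candidate
        # Stop the upward walk at an autoinclude root or at the repository root.
        if directory == repo_root or directory.relative_to(repo_root).as_posix() in roots:
            return None
        directory = directory.parent


# Hypothetical usage:
# resolve_linter_config(Path("/repo"), Path("/repo/contrib/python/aioresponses/setup.py"), "ruff")
```

Keeping the linter-to-config-file mapping in `test_const` lets every styler share one resolution path instead of hard-coding file names per tool, which appears to be the point of introducing the config entity in this patch.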